From 120bbede4274f254001033eecb6be6c944357d51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 4 Sep 2024 17:51:12 -0400
Subject: [PATCH 01/41] include the new device base class

---
 .../lightning_gpu/_state_vector.py            | 270 +++++++++++++++++-
 .../lightning_gpu/lightning_gpu.py            | 163 +++++++++--
 pyproject.toml                                |   8 +-
 3 files changed, 416 insertions(+), 25 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index d52875e337..3f70d1705d 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -14,11 +14,36 @@
 """
 Class implementation for lightning_gpu state-vector manipulation.
 """
+try:
+    from pennylane_lightning.lightning_gpu_ops import (
+        StateVectorC64,
+        StateVectorC128,
+    )
+    
+    try: # Try to import the MPI modules
+        # pylint: disable=no-name-in-module
+        from pennylane_lightning.lightning_gpu_ops import (
+            StateVectorMPIC64,
+            StateVectorMPIC128,
+        )
+
+        MPI_SUPPORT = True
+    except ImportError:
+        MPI_SUPPORT = False
+        
+except ImportError:
+    pass
+
+from itertools import product
 
 import numpy as np
 import pennylane as qml
 from pennylane import DeviceError
+from pennylane.ops.op_math import Adjoint
 from pennylane.wires import Wires
+from pennylane.measurements import MidMeasureMP
+from pennylane.ops import Conditional
+
 
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
 
@@ -37,7 +62,10 @@ class LightningGPUStateVector(LightningBaseStateVector):
         device_name(string): state vector device name. Options: ["lightning.gpu"]
     """
 
-    def __init__(self, num_wires, dtype=np.complex128, device_name="lightning.gpu"):
+    def __init__(self, num_wires, dtype=np.complex128, device_name="lightning.gpu", 
+                 mpi_handler = None, 
+                 sync=True,
+                 ):
 
         if device_name != "lightning.gpu":
             raise DeviceError(f'The device name "{device_name}" is not a valid option.')
@@ -45,3 +73,243 @@ def __init__(self, num_wires, dtype=np.complex128, device_name="lightning.gpu"):
         super().__init__(num_wires, dtype)
 
         self._device_name = device_name
+
+        self._num_global_wires = self._mpi_handler.num_global_wires
+        self._num_local_wires = self._mpi_handler.num_local_wires
+
+        self._dtype = dtype
+        self._mpi_handler = mpi_handler
+        self._sync = sync
+
+        self._wires = Wires(range(num_wires))
+        
+        if self._mpi_handler.use_mpi:
+            self._lgpu_state = self._state_dtype()(
+                self._mpi_handler.mpi_manager,
+                self._mpi_handler.devtag,
+                self._mpi_handler.mpi_buf_size,
+                self._mpi_handler.num_global_wires,
+                self._mpi_handler.num_local_wires,
+            )
+
+        if not self._mpi_handler.use_mpi:
+            self._lgpu_state = self._state_dtype()(self.num_wires)
+            
+    @property
+    def dtype(self):
+        """Returns the state vector data type."""
+        return self._dtype
+
+    @property
+    def device_name(self):
+        """Returns the state vector device name."""
+        return self._device_name
+
+    @property
+    def wires(self):
+        """All wires that can be addressed on this device"""
+        return self._wires
+
+    @property
+    def num_wires(self):
+        """Number of wires addressed on this device"""
+        return self._num_wires
+    
+    @property
+    def state_vector(self):
+        """Returns a handle to the state vector."""
+        return self._lgpu_state
+
+    @property
+    def state(self):
+        """Copy the state vector data from the device to the host.
+
+        A state vector Numpy array is explicitly allocated on the host to store and return the data.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> print(dev.state)
+        [0.+0.j 1.+0.j]
+        """
+        state = np.zeros(1 << self._num_local_wires, dtype=self.dtype)
+        self.syncD2H(state)
+        return state
+
+    def _state_dtype(self):
+        """Binding to Lightning Managed state vector C++ class.
+
+        Returns: the state vector class
+        """
+        if self._mpi_handler.use_mpi:
+            return StateVectorMPIC128 if self.dtype == np.complex128 else StateVectorMPIC64
+        else:
+            return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64
+
+    def reset_state(self):
+        """Reset the device's state"""
+        # init the state vector to |00..0>
+        self._gpu_state.resetGPU(False)  # Sync reset
+
+        self._lgpu_state.resetStateVector()
+
+    def syncD2H(self, state_vector, use_async=False):
+        """Copy the state vector data on device to a state vector on the host provided by the user
+        Args:
+            state_vector(array[complex]): the state vector array on host
+            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+            Note: This function only supports synchronized memory copy.
+
+        **Example**
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.C_DTYPE)
+        >>> dev.syncD2H(state_vector)
+        >>> print(state_vector)
+        [0.+0.j 1.+0.j]
+        """
+        self._lgpu_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
+
+    def syncH2D(self, state_vector, use_async=False):
+        """Copy the state vector data on host provided by the user to the state vector on the device
+        Args:
+            state_vector(array[complex]): the state vector array on host.
+            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+            Note: This function only supports synchronized memory copy.
+
+        **Example**
+        >>> dev = qml.device('lightning.gpu', wires=3)
+        >>> obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2)
+        >>> obs1 = qml.Identity(1)
+        >>> H = qml.Hamiltonian([1.0, 1.0], [obs1, obs])
+        >>> state_vector = np.array([0.0 + 0.0j, 0.0 + 0.1j, 0.1 + 0.1j, 0.1 + 0.2j,
+            0.2 + 0.2j, 0.3 + 0.3j, 0.3 + 0.4j, 0.4 + 0.5j,], dtype=np.complex64,)
+        >>> dev.syncH2D(state_vector)
+        >>> res = dev.expval(H)
+        >>> print(res)
+        1.0
+        """
+        self._lgpu_state.HostToDevice(state_vector.ravel(order="C"), use_async)
+
+    def _apply_state_vector(self, state, device_wires, use_async=False):
+        """Initialize the state vector on GPU with a specified state on host.
+        Note that any use of this method will introduce host-overheads.
+        Args:
+        state (array[complex]): normalized input state (on host) of length ``2**len(wires)``
+                or broadcasted state of shape ``(batch_size, 2**len(wires))``
+        device_wires (Wires): wires that get initialized in the state
+        use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+        Note: This function only supports synchronized memory copy from host to device.
+        """
+        # translate to wire labels used by device
+        # device_wires = self.map_wires(device_wires)
+
+        # state = self._asarray(state, dtype=self.C_DTYPE)  # this operation on host
+        output_shape = [2] * self._num_local_wires
+
+        if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
+            # Initialize the entire device state with the input state
+            if self.num_wires == self._num_local_wires:
+                self.syncH2D(self._reshape(state, output_shape))
+                return
+            local_state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
+            self._mpi_handler.mpi_manager.Scatter(state, local_state, 0)
+            # Initialize the entire device state with the input state
+            # self.syncH2D(self._reshape(local_state, output_shape))
+            self.syncH2D(np.reshape(local_state, output_shape))
+            return
+
+        # generate basis states on subset of qubits via the cartesian product
+        basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
+
+        # get basis states to alter on full set of qubits
+        unravelled_indices = np.zeros((2 ** len(device_wires), self.num_wires), dtype=int)
+        unravelled_indices[:, device_wires] = basis_states
+
+        # get indices for which the state is changed to input state vector elements
+        ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires)
+
+        # set the state vector on GPU with the unravelled_indices and their corresponding values
+        self._lgpu_state.setStateVector(
+            ravelled_indices, state, use_async
+        )  # this operation on device
+
+    def _apply_basis_state(self, state, wires):
+        """Initialize the state vector in a specified computational basis state on GPU directly.
+            Args:
+            state (array[int]): computational basis state (on host) of shape ``(wires,)``
+                consisting of 0s and 1s.
+            wires (Wires): wires that the provided computational state should be initialized on
+        Note: This function does not support broadcasted inputs yet.
+        """
+        # # translate to wire labels used by device
+        # device_wires = self.map_wires(wires)
+
+        # length of basis state parameter
+        if not set(state.tolist()).issubset({0, 1}):
+            raise ValueError("BasisState parameter must consist of 0 or 1 integers.")
+
+        if len(state) != len(wires):
+            raise ValueError("BasisState parameter and wires must be of equal length.")
+
+        self._lgpu_state.setStateVector(list(state), list(wires))
+        # # get computational basis state number
+        # basis_states = 2 ** (self.num_wires - 1 - np.array(list(wires)))
+        # basis_states = qml.math.convert_like(basis_states, state)
+        # num = int(qml.math.dot(state, basis_states))
+
+        # self._create_basis_state(num)
+
+    def _apply_lightning(
+        self, operations, mid_measurements: dict = None, postselect_mode: str = None
+    ):
+        """Apply a list of operations to the state tensor.
+
+        Args:
+            operations (list[~pennylane.operation.Operation]): operations to apply
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots. Default is ``None``.
+
+        Returns:
+            None
+        """
+        state = self.state_vector
+
+        # Skip over identity operations instead of performing
+        # matrix multiplication with it.
+        for operation in operations:
+            if isinstance(operation, qml.Identity):
+                continue
+            if isinstance(operation, Adjoint):
+                name = operation.base.name
+                invert_param = True
+            else:
+                name = operation.name
+                invert_param = False
+            method = getattr(state, name, None)
+            wires = list(operation.wires)
+
+            if isinstance(operation, Conditional):
+                if operation.meas_val.concretize(mid_measurements):
+                    self._apply_lightning([operation.base])
+            elif isinstance(operation, MidMeasureMP):
+                self._apply_lightning_midmeasure(
+                    operation, mid_measurements, postselect_mode=postselect_mode
+                )
+            elif method is not None:  # apply specialized gate
+                param = operation.parameters
+                method(wires, invert_param, param)
+            elif isinstance(operation, qml.ops.Controlled):  # apply n-controlled gate
+                self._apply_lightning_controlled(operation)
+            else:  # apply gate as a matrix
+                # Inverse can be set to False since qml.matrix(operation) is already in
+                # inverted form
+                method = getattr(state, "applyMatrix")
+                try:
+                    method(qml.matrix(operation), wires, False)
+                except AttributeError:  # pragma: no cover
+                    # To support older versions of PL
+                    method(operation.matrix, wires, False)
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 49f1cdcab2..3f2c36a75b 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -44,37 +44,30 @@
 
 try:
 
-    from pennylane_lightning.lightning_gpu_ops import backend_info
+    from pennylane_lightning.lightning_gpu_ops import (
+        DevPool,
+        backend_info,
+        get_gpu_arch,
+        is_gpu_supported,
+    )
+
+    LGPU_CPP_BINARY_AVAILABLE = True
 
     try:
         # pylint: disable=no-name-in-module
-        from pennylane_lightning.lightning_gpu_ops import MPIManager
-
+        from pennylane_lightning.lightning_gpu_ops import (
+            DevTag,
+            MPIManager,
+        )            
         MPI_SUPPORT = True
     except ImportError:
         MPI_SUPPORT = False
 
-    if find_library("custatevec") is None and not imp_util.find_spec("cuquantum"):
-        raise ImportError(
-            "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
-        )
-    # if not DevPool.getTotalDevices():  # pragma: no cover
-    #     raise ValueError("No supported CUDA-capable device found")
-
-    # if not is_gpu_supported():  # pragma: no cover
-    #     raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
-
-    LGPU_CPP_BINARY_AVAILABLE = True
-
 except (ImportError, ValueError) as e:
     backend_info = None
     LGPU_CPP_BINARY_AVAILABLE = False
 
 
-def _mebibytesToBytes(mebibytes):
-    return mebibytes * 1024 * 1024
-
-
 _operations = frozenset(
     {
         "Identity",
@@ -167,6 +160,111 @@ def _mebibytesToBytes(mebibytes):
     qml.QubitUnitary,
 )
 
+# MPI options
+class LightningGPU_MPIHandler():
+    """MPI handler for PennyLane Lightning GPU device  
+    
+    MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
+    
+    Use the MPI library is necessary to initialize different variables and methods to handle the data across  nodes and perform checks for memory allocation on each device. 
+    
+    Args:
+        mpi (bool): declare if the device will use the MPI support.
+        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
+        dev_pool (Callable): Method to handle the GPU devices available.
+        num_wires (int): the number of wires to initialize the device wit.h 
+        c_dtype (np.complex64, np.complex128): Datatypes for statevector representation
+        
+    """
+
+    def __init__(self, 
+                 mpi: bool, 
+                 mpi_buf_size: int, 
+                 dev_pool: Callable, 
+                 num_wires: int, 
+                 c_dtype: Union[np.complex64, np.complex128]) -> None:
+        
+        self.use_mpi = mpi
+        self.mpi_but_size = mpi_buf_size
+        self._dp = dev_pool
+        
+        if self.use_mpi: 
+            
+            if not MPI_SUPPORT:
+                raise ImportError("MPI related APIs are not found.")
+
+            if mpi_buf_size < 0:
+                raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
+
+            if (mpi_buf_size > 0 
+                and (mpi_buf_size & (mpi_buf_size - 1))):
+
+                raise ValueError(f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2.")
+            
+            # After check if all MPI parameter are ok
+            self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
+
+            # set the number of global and local wires
+            commSize = self._mpi_manager.getSize()
+            self.num_global_wires = commSize.bit_length() - 1
+            self.num_local_wires = num_wires - self._num_global_wires
+            
+            # Memory size in bytes
+            sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
+            if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
+                raise ValueError("The MPI buffer size is larger than the local state vector size.")
+
+        if not self.use_mpi: 
+            self.num_local_wires = num_wires
+
+    def _mebibytesToBytes(mebibytes):
+        return mebibytes * 1024 * 1024
+    
+    def _mpi_init_helper(self, num_wires):
+        """Set up MPI checks and initializations."""
+        
+        # initialize MPIManager and config check in the MPIManager ctor
+        mpi_manager = MPIManager()
+        
+        # check if number of GPUs per node is larger than number of processes per node
+        numDevices = self._dp.getTotalDevices()
+        numProcsNode = mpi_manager.getSizeNode()
+        
+        if numDevices < numProcsNode:
+            raise ValueError(
+                "Number of devices should be larger than or equal to the number of processes on each node."
+            )
+        
+        # check if the process number is larger than number of statevector elements
+        if mpi_manager.getSize() > (1 << (num_wires - 1)):
+            raise ValueError(
+                "Number of processes should be smaller than the number of statevector elements."
+            )
+        
+        # set GPU device
+        rank = self._mpi_manager.getRank()
+        deviceid = rank % numProcsNode
+        self._dp.setDeviceID(deviceid)
+        devtag = DevTag(deviceid)
+        
+        return (mpi_manager, devtag)
+
+
+def check_gpu_resources() -> None:
+    """ Check the available resources of each Nvidia GPU """
+    if (find_library("custatevec") is None 
+        and not imp_util.find_spec("cuquantum")):
+        
+        raise ImportError(
+            "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
+        )
+        
+    if not DevPool.getTotalDevices():
+        raise ValueError("No supported CUDA-capable device found")
+
+    if not is_gpu_supported():
+        raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
+    
 
 def stopping_condition(op: Operator) -> bool:
     """A function that determines whether or not an operation is supported by ``lightning.gpu``."""
@@ -243,6 +341,9 @@ class LightningGPU(LightningBase):
         batch_obs (bool): Determine whether we process observables in parallel when
             computing the jacobian. This value is only relevant when the lightning.gpu
             is built with MPI. Default is False.
+        mpi (bool): declare if the device will use the MPI support.
+        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
+        sync (bool): immediately sync with host-sv after applying operation
     """
 
     # General device options
@@ -269,7 +370,10 @@ def __init__(  # pylint: disable=too-many-arguments
         c_dtype=np.complex128,
         shots=None,
         batch_obs=False,
-        # GPU arguments
+        # GPU and MPI arguments
+        mpi: bool = False,
+        mpi_buf_size: int = 0,
+        sync: bool = False,
     ):
         if not self._CPP_BINARY_AVAILABLE:
             raise ImportError(
@@ -277,6 +381,8 @@ def __init__(  # pylint: disable=too-many-arguments
                 "To manually compile from source, follow the instructions at "
                 "https://docs.pennylane.ai/projects/lightning/en/stable/dev/installation.html."
             )
+            
+        check_gpu_resources()
 
         super().__init__(
             wires=wires,
@@ -290,6 +396,23 @@ def __init__(  # pylint: disable=too-many-arguments
         # GPU specific options
 
         # Creating the state vector
+        
+        self._dp = DevPool()
+        self._c_dtype = c_dtype
+        self._batch_obs = batch_obs
+        self._sync = sync
+        
+        if isinstance(wires, int):
+            self._wire_map = None  # should just use wires as is
+        else:
+            self._wire_map = {w: i for i, w in enumerate(self.wires)}
+
+        self._mpi_handler = LightningGPU_MPIHandler(mpi, mpi_buf_size, self._dp, self.num_wires, c_dtype)
+        
+        self._num_local_wires = self._mpi_handler.num_local_wires
+
+        self._statevector = LightningGPUStateVector(self.num_wires, dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync)
+
 
     @property
     def name(self):
diff --git a/pyproject.toml b/pyproject.toml
index b50ac438d2..1666cc2427 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,14 +1,14 @@
 [build-system]
-requires = [ "cmake~=3.24.0", "ninja; platform_system!='Windows'", "setuptools>=42", "toml",]
+requires = [ "cmake~=3.24.0", "ninja; platform_system!='Windows'", "setuptools>=42", "toml", "custatevec-cu12",]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "PennyLane_Lightning"
+name = "PennyLane_Lightning_GPU"
 description = "PennyLane-Lightning plugin"
 readme = "README.rst"
 requires-python = ">=3.9"
 classifiers = [ "Development Status :: 4 - Beta", "Environment :: Console", "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Natural Language :: English", "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX", "Operating System :: POSIX :: Linux", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Physics",]
-dependencies = [ "pennylane>=0.37",]
+dependencies = [ "pennylane>=0.37", "pennylane_lightning==0.38.0-dev34",]
 dynamic = [ "version",]
 [[project.maintainers]]
 name = "Xanadu Quantum Technologies Inc."
@@ -25,4 +25,4 @@ kokkos = [ "pennylane-lightning-kokkos",]
 Homepage = "https://github.com/PennyLaneAI/pennylane-lightning"
 
 [project.entry-points."pennylane.plugins"]
-"lightning.qubit" = "pennylane_lightning.lightning_qubit:LightningQubit"
+"lightning.gpu" = "pennylane_lightning.lightning_gpu:LightningGPU"

From bdf4daa1086ef89657b97c7d2f4f4483149e4b92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 9 Sep 2024 09:19:38 -0400
Subject: [PATCH 02/41] add measurements

---
 .../lightning_gpu/_measurements.py            | 83 +++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 3f4890c55d..14117fbb54 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -15,8 +15,24 @@
 Class implementation for state vector measurements.
 """
 
+try:
+    from pennylane_lightning.lightning_gpu_ops import MeasurementsC64, MeasurementsC128
+    
+    try:
+        from pennylane_lightning.lightning_gpu_ops import MeasurementsMPIC64,MeasurementsMPIC128
+
+        MPI_SUPPORT = True
+    except ImportError:
+        MPI_SUPPORT = False
+
+except ImportError:
+    pass
+
+from typing import List
+
 import numpy as np
 import pennylane as qml
+from pennylane.measurements import CountsMP, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
@@ -37,3 +53,70 @@ def __init__(
     ) -> TensorLike:
 
         super().__init__(lgpu_state)
+
+        self._measurement_lightning = self._measurement_dtype()(lgpu_state.state_vector)
+
+    def _measurement_dtype(self):
+        """Binding to Lightning Kokkos Measurements C++ class.
+
+        Returns: the Measurements class
+        """
+        return MeasurementsC64 if self.dtype == np.complex64 else MeasurementsC128
+
+    def _measure_with_samples_diagonalizing_gates(
+        self,
+        mps: List[SampleMeasurement],
+        shots: Shots,
+    ) -> TensorLike:
+        """
+        Returns the samples of the measurement process performed on the given state,
+        by rotating the state into the measurement basis using the diagonalizing gates
+        given by the measurement process.
+
+        Args:
+            mps (~.measurements.SampleMeasurement): The sample measurements to perform
+            shots (~.measurements.Shots): The number of samples to take
+
+        Returns:
+            TensorLike[Any]: Sample measurement results
+        """
+        # apply diagonalizing gates
+        self._apply_diagonalizing_gates(mps)
+
+        # Specific for Kokkos:
+        total_indices = self._qubit_state.num_wires
+        wires = qml.wires.Wires(range(total_indices))
+
+        def _process_single_shot(samples):
+            processed = []
+            for mp in mps:
+                res = mp.process_samples(samples, wires)
+                if not isinstance(mp, CountsMP):
+                    res = qml.math.squeeze(res)
+
+                processed.append(res)
+
+            return tuple(processed)
+
+        try:
+            samples = self._measurement_lightning.generate_samples(
+                len(wires), shots.total_shots
+            ).astype(int, copy=False)
+
+        except ValueError as e:
+            if str(e) != "probabilities contain NaN":
+                raise e
+            samples = qml.math.full((shots.total_shots, len(wires)), 0)
+
+        self._apply_diagonalizing_gates(mps, adjoint=True)
+
+        # if there is a shot vector, use the shots.bins generator to
+        # split samples w.r.t. the shots
+        processed_samples = []
+        for lower, upper in shots.bins():
+            result = _process_single_shot(samples[..., lower:upper, :])
+            processed_samples.append(result)
+
+        return (
+            tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
+        )

From b05a6a41b2c1bc04b5af5baede89de5c1e349c02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 9 Sep 2024 14:04:06 +0000
Subject: [PATCH 03/41] State vector almos done

---
 .../lightning_gpu/_mpi_handler.py             | 105 +++++++++
 .../lightning_gpu/_state_vector.py            | 213 ++++++++++++------
 .../lightning_gpu/lightning_gpu.py            | 156 ++-----------
 tests/conftest.py                             |   4 +-
 .../test_state_vector_class.py                |  19 +-
 5 files changed, 284 insertions(+), 213 deletions(-)
 create mode 100644 pennylane_lightning/lightning_gpu/_mpi_handler.py

diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
new file mode 100644
index 0000000000..b8e39209c6
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -0,0 +1,105 @@
+
+
+try:
+    # pylint: disable=no-name-in-module
+    from pennylane_lightning.lightning_gpu_ops import (
+        DevTag,
+        MPIManager,
+    )            
+    MPI_SUPPORT = True
+except ImportError:
+    MPI_SUPPORT = False
+
+from typing import Callable, Union
+
+import numpy as np
+
+# MPI options
+class LightningGPU_MPIHandler():
+    """MPI handler for PennyLane Lightning GPU device  
+    
+    MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
+    
+    Use the MPI library is necessary to initialize different variables and methods to handle the data across  nodes and perform checks for memory allocation on each device. 
+    
+    Args:
+        mpi (bool): declare if the device will use the MPI support.
+        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
+        dev_pool (Callable): Method to handle the GPU devices available.
+        num_wires (int): the number of wires to initialize the device wit.h 
+        c_dtype (np.complex64, np.complex128): Datatypes for statevector representation
+        
+    """
+
+    def __init__(self, 
+                 mpi: bool, 
+                 mpi_buf_size: int, 
+                 dev_pool: Callable, 
+                 num_wires: int, 
+                 c_dtype: Union[np.complex64, np.complex128]) -> None:
+        
+        self.use_mpi = mpi
+        self.mpi_but_size = mpi_buf_size
+        self._dp = dev_pool
+        
+        if self.use_mpi: 
+            
+            if not MPI_SUPPORT:
+                raise ImportError("MPI related APIs are not found.")
+
+            if mpi_buf_size < 0:
+                raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
+
+            if (mpi_buf_size > 0 
+                and (mpi_buf_size & (mpi_buf_size - 1))):
+
+                raise ValueError(f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2.")
+            
+            # After check if all MPI parameter are ok
+            self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
+
+            # set the number of global and local wires
+            commSize = self._mpi_manager.getSize()
+            self.num_global_wires = commSize.bit_length() - 1
+            self.num_local_wires = num_wires - self._num_global_wires
+            
+            # Memory size in bytes
+            sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
+            if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
+                raise ValueError("The MPI buffer size is larger than the local state vector size.")
+
+        if not self.use_mpi: 
+            self.num_local_wires = num_wires
+            self.num_global_wires = num_wires
+
+    def _mebibytesToBytes(mebibytes):
+        return mebibytes * 1024 * 1024
+    
+    def _mpi_init_helper(self, num_wires):
+        """Set up MPI checks and initializations."""
+        
+        # initialize MPIManager and config check in the MPIManager ctor
+        mpi_manager = MPIManager()
+        
+        # check if number of GPUs per node is larger than number of processes per node
+        numDevices = self._dp.getTotalDevices()
+        numProcsNode = mpi_manager.getSizeNode()
+        
+        if numDevices < numProcsNode:
+            raise ValueError(
+                "Number of devices should be larger than or equal to the number of processes on each node."
+            )
+        
+        # check if the process number is larger than number of statevector elements
+        if mpi_manager.getSize() > (1 << (num_wires - 1)):
+            raise ValueError(
+                "Number of processes should be smaller than the number of statevector elements."
+            )
+        
+        # set GPU device
+        rank = self._mpi_manager.getRank()
+        deviceid = rank % numProcsNode
+        self._dp.setDeviceID(deviceid)
+        devtag = DevTag(deviceid)
+        
+        return (mpi_manager, devtag)
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 3f70d1705d..1d1d8c775e 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -43,12 +43,23 @@
 from pennylane.wires import Wires
 from pennylane.measurements import MidMeasureMP
 from pennylane.ops import Conditional
+from pennylane import QuantumFunctionError, DeviceError
 
-
+from pennylane_lightning.core._serialize import global_phase_diagonal
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
 
 from ._measurements import LightningGPUMeasurements
-
+from ._mpi_handler import LightningGPU_MPIHandler
+
+gate_cache_needs_hash = (
+    qml.BlockEncode,
+    qml.ControlledQubitUnitary,
+    qml.DiagonalQubitUnitary,
+    qml.MultiControlledX,
+    qml.OrbitalRotation,
+    qml.PSWAP,
+    qml.QubitUnitary,
+)
 
 class LightningGPUStateVector(LightningBaseStateVector):
     """Lightning GPU state-vector class.
@@ -73,18 +84,19 @@ def __init__(self, num_wires, dtype=np.complex128, device_name="lightning.gpu",
         super().__init__(num_wires, dtype)
 
         self._device_name = device_name
+        
+        if mpi_handler is None:
+            mpi_handler = LightningGPU_MPIHandler(False, 0, None, num_wires, dtype)
 
-        self._num_global_wires = self._mpi_handler.num_global_wires
-        self._num_local_wires = self._mpi_handler.num_local_wires
+        self._num_global_wires = mpi_handler.num_global_wires
+        self._num_local_wires = mpi_handler.num_local_wires
 
-        self._dtype = dtype
         self._mpi_handler = mpi_handler
         self._sync = sync
 
-        self._wires = Wires(range(num_wires))
-        
+        # Initialize the state vector
         if self._mpi_handler.use_mpi:
-            self._lgpu_state = self._state_dtype()(
+            self._qubit_state = self._state_dtype()(
                 self._mpi_handler.mpi_manager,
                 self._mpi_handler.devtag,
                 self._mpi_handler.mpi_buf_size,
@@ -93,49 +105,9 @@ def __init__(self, num_wires, dtype=np.complex128, device_name="lightning.gpu",
             )
 
         if not self._mpi_handler.use_mpi:
-            self._lgpu_state = self._state_dtype()(self.num_wires)
+            self._qubit_state = self._state_dtype()(self.num_wires)
             
-    @property
-    def dtype(self):
-        """Returns the state vector data type."""
-        return self._dtype
-
-    @property
-    def device_name(self):
-        """Returns the state vector device name."""
-        return self._device_name
-
-    @property
-    def wires(self):
-        """All wires that can be addressed on this device"""
-        return self._wires
-
-    @property
-    def num_wires(self):
-        """Number of wires addressed on this device"""
-        return self._num_wires
-    
-    @property
-    def state_vector(self):
-        """Returns a handle to the state vector."""
-        return self._lgpu_state
-
-    @property
-    def state(self):
-        """Copy the state vector data from the device to the host.
-
-        A state vector Numpy array is explicitly allocated on the host to store and return the data.
-
-        **Example**
-
-        >>> dev = qml.device('lightning.gpu', wires=1)
-        >>> dev.apply([qml.PauliX(wires=[0])])
-        >>> print(dev.state)
-        [0.+0.j 1.+0.j]
-        """
-        state = np.zeros(1 << self._num_local_wires, dtype=self.dtype)
-        self.syncD2H(state)
-        return state
+        self._create_basis_state(0)
 
     def _state_dtype(self):
         """Binding to Lightning Managed state vector C++ class.
@@ -150,9 +122,7 @@ def _state_dtype(self):
     def reset_state(self):
         """Reset the device's state"""
         # init the state vector to |00..0>
-        self._gpu_state.resetGPU(False)  # Sync reset
-
-        self._lgpu_state.resetStateVector()
+        self._qubit_state.resetGPU(False)  # Sync reset
 
     def syncD2H(self, state_vector, use_async=False):
         """Copy the state vector data on device to a state vector on the host provided by the user
@@ -169,7 +139,26 @@ def syncD2H(self, state_vector, use_async=False):
         >>> print(state_vector)
         [0.+0.j 1.+0.j]
         """
-        self._lgpu_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
+        self._qubit_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
+            
+
+    @property
+    def state(self):
+        """Copy the state vector data from the device to the host.
+
+        A state vector Numpy array is explicitly allocated on the host to store and return the data.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> print(dev.state)
+        [0.+0.j 1.+0.j]
+        """
+        state = np.zeros(1 << self._num_local_wires, dtype=self.dtype)
+        self.syncD2H(state)
+        return state
+
 
     def syncH2D(self, state_vector, use_async=False):
         """Copy the state vector data on host provided by the user to the state vector on the device
@@ -190,7 +179,28 @@ def syncH2D(self, state_vector, use_async=False):
         >>> print(res)
         1.0
         """
-        self._lgpu_state.HostToDevice(state_vector.ravel(order="C"), use_async)
+        self._qubit_state.HostToDevice(state_vector.ravel(order="C"), use_async)
+    
+    @staticmethod
+    def _asarray(arr, dtype=None):
+        arr = np.asarray(arr)  # arr is not copied
+
+        if arr.dtype.kind not in ["f", "c"]:
+            return arr
+
+        if not dtype:
+            dtype = arr.dtype
+
+        return arr
+
+    def _create_basis_state(self, index, use_async=False):
+        """Return a computational basis state over all wires.
+        Args:
+            index (int): integer representing the computational basis state.
+            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+            Note: This function only supports synchronized memory copy.
+        """
+        self._qubit_state.setBasisState(index, use_async)
 
     def _apply_state_vector(self, state, device_wires, use_async=False):
         """Initialize the state vector on GPU with a specified state on host.
@@ -202,16 +212,21 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
         use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
         Note: This function only supports synchronized memory copy from host to device.
         """
-        # translate to wire labels used by device
-        # device_wires = self.map_wires(device_wires)
+        
+        if isinstance(state, self._qubit_state.__class__):
+            raise DeviceError("LightningGPU does not support allocate external state_vector.")
+            
+            state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
+            state.getState(state_data)
+            state = state_data
 
-        # state = self._asarray(state, dtype=self.C_DTYPE)  # this operation on host
+        state = self._asarray(state, dtype=self.dtype)  # this operation on host
         output_shape = [2] * self._num_local_wires
 
         if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
             # Initialize the entire device state with the input state
             if self.num_wires == self._num_local_wires:
-                self.syncH2D(self._reshape(state, output_shape))
+                self.syncH2D(np.reshape(state, output_shape))
                 return
             local_state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
             self._mpi_handler.mpi_manager.Scatter(state, local_state, 0)
@@ -231,7 +246,7 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
         ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires)
 
         # set the state vector on GPU with the unravelled_indices and their corresponding values
-        self._lgpu_state.setStateVector(
+        self._qubit_state.setStateVector(
             ravelled_indices, state, use_async
         )  # this operation on device
 
@@ -243,24 +258,60 @@ def _apply_basis_state(self, state, wires):
             wires (Wires): wires that the provided computational state should be initialized on
         Note: This function does not support broadcasted inputs yet.
         """
-        # # translate to wire labels used by device
-        # device_wires = self.map_wires(wires)
+        # translate to wire labels used by device
+        device_wires = wires
 
-        # length of basis state parameter
         if not set(state.tolist()).issubset({0, 1}):
             raise ValueError("BasisState parameter must consist of 0 or 1 integers.")
 
         if len(state) != len(wires):
             raise ValueError("BasisState parameter and wires must be of equal length.")
 
-        self._lgpu_state.setStateVector(list(state), list(wires))
-        # # get computational basis state number
-        # basis_states = 2 ** (self.num_wires - 1 - np.array(list(wires)))
-        # basis_states = qml.math.convert_like(basis_states, state)
-        # num = int(qml.math.dot(state, basis_states))
+        # get computational basis state number
+        basis_states = 1 << (self.num_wires - 1 - np.array(device_wires))
+        basis_states = qml.math.convert_like(basis_states, state)
+        num = int(qml.math.dot(state, basis_states))
 
-        # self._create_basis_state(num)
+        self._create_basis_state(num)
 
+    def _apply_lightning_controlled(self, operation):
+        """Apply an arbitrary controlled operation to the state tensor.
+
+        Args:
+            operation (~pennylane.operation.Operation): controlled operation to apply
+
+        Returns:
+            None
+        """
+        state = self.state_vector
+
+        control_wires = list(operation.control_wires)
+        control_values = operation.control_values
+        name = operation.name
+        # Apply GlobalPhase
+        inv = False
+        param = operation.parameters[0]
+        wires = self.wires.indices(operation.wires)
+        matrix = global_phase_diagonal(param, self.wires, control_wires, control_values)
+        state.apply(name, wires, inv, [[param]], matrix)
+        
+    def _apply_lightning_midmeasure(
+        self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str
+    ):
+        """Execute a MidMeasureMP operation and return the sample in mid_measurements.
+
+        Args:
+            operation (~pennylane.operation.Operation): mid-circuit measurement
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots.
+
+        Returns:
+            None
+        """
+        raise DeviceError("LightningGPU does not support Mid Circuit Measure.")
+    
     def _apply_lightning(
         self, operations, mid_measurements: dict = None, postselect_mode: str = None
     ):
@@ -302,14 +353,26 @@ def _apply_lightning(
             elif method is not None:  # apply specialized gate
                 param = operation.parameters
                 method(wires, invert_param, param)
-            elif isinstance(operation, qml.ops.Controlled):  # apply n-controlled gate
+            elif isinstance(operation, qml.ops.Controlled) and isinstance(
+                operation.base, qml.GlobalPhase
+            ):  # apply n-controlled gate
+                # LGPU do not support the controlled gates except for GlobalPhase
                 self._apply_lightning_controlled(operation)
             else:  # apply gate as a matrix
-                # Inverse can be set to False since qml.matrix(operation) is already in
-                # inverted form
-                method = getattr(state, "applyMatrix")
                 try:
-                    method(qml.matrix(operation), wires, False)
+                    mat = qml.matrix(operation)
                 except AttributeError:  # pragma: no cover
                     # To support older versions of PL
-                    method(operation.matrix, wires, False)
+                    mat = operation.matrix
+                r_dtype = np.float32 if self.dtype == np.complex64 else np.float64
+                param = [[r_dtype(operation.hash)]] if isinstance(operation, gate_cache_needs_hash) else []
+                if len(mat) == 0:
+                    raise ValueError("Unsupported operation")
+                self._qubit_state.apply(
+                    name,
+                    wires,
+                    False,
+                    param,
+                    mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
+                )  # Parameters can be ignored for explicit matrices; F-order for cuQuantum
+
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 3f2c36a75b..b1725d6781 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -19,9 +19,8 @@
 
 from ctypes.util import find_library
 from importlib import util as imp_util
-from numbers import Number
 from pathlib import Path
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Callable, Union
 
 import numpy as np
 import pennylane as qml
@@ -43,7 +42,6 @@
 from ._state_vector import LightningGPUStateVector
 
 try:
-
     from pennylane_lightning.lightning_gpu_ops import (
         DevPool,
         backend_info,
@@ -58,7 +56,8 @@
         from pennylane_lightning.lightning_gpu_ops import (
             DevTag,
             MPIManager,
-        )            
+        )
+        from ._mpi_handler import LightningGPU_MPIHandler            
         MPI_SUPPORT = True
     except ImportError:
         MPI_SUPPORT = False
@@ -150,122 +149,6 @@
     }
 )
 
-gate_cache_needs_hash = (
-    qml.BlockEncode,
-    qml.ControlledQubitUnitary,
-    qml.DiagonalQubitUnitary,
-    qml.MultiControlledX,
-    qml.OrbitalRotation,
-    qml.PSWAP,
-    qml.QubitUnitary,
-)
-
-# MPI options
-class LightningGPU_MPIHandler():
-    """MPI handler for PennyLane Lightning GPU device  
-    
-    MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
-    
-    Use the MPI library is necessary to initialize different variables and methods to handle the data across  nodes and perform checks for memory allocation on each device. 
-    
-    Args:
-        mpi (bool): declare if the device will use the MPI support.
-        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
-        dev_pool (Callable): Method to handle the GPU devices available.
-        num_wires (int): the number of wires to initialize the device wit.h 
-        c_dtype (np.complex64, np.complex128): Datatypes for statevector representation
-        
-    """
-
-    def __init__(self, 
-                 mpi: bool, 
-                 mpi_buf_size: int, 
-                 dev_pool: Callable, 
-                 num_wires: int, 
-                 c_dtype: Union[np.complex64, np.complex128]) -> None:
-        
-        self.use_mpi = mpi
-        self.mpi_but_size = mpi_buf_size
-        self._dp = dev_pool
-        
-        if self.use_mpi: 
-            
-            if not MPI_SUPPORT:
-                raise ImportError("MPI related APIs are not found.")
-
-            if mpi_buf_size < 0:
-                raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
-
-            if (mpi_buf_size > 0 
-                and (mpi_buf_size & (mpi_buf_size - 1))):
-
-                raise ValueError(f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2.")
-            
-            # After check if all MPI parameter are ok
-            self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
-
-            # set the number of global and local wires
-            commSize = self._mpi_manager.getSize()
-            self.num_global_wires = commSize.bit_length() - 1
-            self.num_local_wires = num_wires - self._num_global_wires
-            
-            # Memory size in bytes
-            sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
-            if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
-                raise ValueError("The MPI buffer size is larger than the local state vector size.")
-
-        if not self.use_mpi: 
-            self.num_local_wires = num_wires
-
-    def _mebibytesToBytes(mebibytes):
-        return mebibytes * 1024 * 1024
-    
-    def _mpi_init_helper(self, num_wires):
-        """Set up MPI checks and initializations."""
-        
-        # initialize MPIManager and config check in the MPIManager ctor
-        mpi_manager = MPIManager()
-        
-        # check if number of GPUs per node is larger than number of processes per node
-        numDevices = self._dp.getTotalDevices()
-        numProcsNode = mpi_manager.getSizeNode()
-        
-        if numDevices < numProcsNode:
-            raise ValueError(
-                "Number of devices should be larger than or equal to the number of processes on each node."
-            )
-        
-        # check if the process number is larger than number of statevector elements
-        if mpi_manager.getSize() > (1 << (num_wires - 1)):
-            raise ValueError(
-                "Number of processes should be smaller than the number of statevector elements."
-            )
-        
-        # set GPU device
-        rank = self._mpi_manager.getRank()
-        deviceid = rank % numProcsNode
-        self._dp.setDeviceID(deviceid)
-        devtag = DevTag(deviceid)
-        
-        return (mpi_manager, devtag)
-
-
-def check_gpu_resources() -> None:
-    """ Check the available resources of each Nvidia GPU """
-    if (find_library("custatevec") is None 
-        and not imp_util.find_spec("cuquantum")):
-        
-        raise ImportError(
-            "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
-        )
-        
-    if not DevPool.getTotalDevices():
-        raise ValueError("No supported CUDA-capable device found")
-
-    if not is_gpu_supported():
-        raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
-    
-
 def stopping_condition(op: Operator) -> bool:
     """A function that determines whether or not an operation is supported by ``lightning.gpu``."""
     # To avoid building matrices beyond the given thresholds.
@@ -319,6 +202,21 @@ def _add_adjoint_transforms(program: TransformProgram) -> None:
     name = "adjoint + lightning.gpu"
     return 0
 
+def check_gpu_resources() -> None:
+    """ Check the available resources of each Nvidia GPU """
+    if (find_library("custatevec") is None 
+        and not imp_util.find_spec("cuquantum")):
+        
+        raise ImportError(
+            "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
+        )
+        
+    if not DevPool.getTotalDevices():
+        raise ValueError("No supported CUDA-capable device found")
+
+    if not is_gpu_supported():
+        raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
+
 
 @simulator_tracking
 @single_tape_support
@@ -392,26 +290,20 @@ def __init__(  # pylint: disable=too-many-arguments
         )
 
         # Set the attributes to call the LightningGPU classes
+        self._set_Lightning_classes()
 
         # GPU specific options
-
-        # Creating the state vector
-        
         self._dp = DevPool()
-        self._c_dtype = c_dtype
-        self._batch_obs = batch_obs
         self._sync = sync
-        
-        if isinstance(wires, int):
-            self._wire_map = None  # should just use wires as is
-        else:
-            self._wire_map = {w: i for i, w in enumerate(self.wires)}
 
+        # Creating the state vector
+        
         self._mpi_handler = LightningGPU_MPIHandler(mpi, mpi_buf_size, self._dp, self.num_wires, c_dtype)
         
         self._num_local_wires = self._mpi_handler.num_local_wires
 
-        self._statevector = LightningGPUStateVector(self.num_wires, dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync)
+        print("FSDX")
+        self._statevector = self.LightningStateVector(self.num_wires, dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync)
 
 
     @property
@@ -421,7 +313,7 @@ def name(self):
 
     def _set_Lightning_classes(self):
         """Load the LightningStateVector, LightningMeasurements, LightningAdjointJacobian as class attribute"""
-        return 0
+        self.LightningStateVector = LightningGPUStateVector
 
     def _setup_execution_config(self, config):
         """
diff --git a/tests/conftest.py b/tests/conftest.py
index b5ddf416ce..1b3d97a208 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -142,10 +142,12 @@ def get_device():
         from pennylane_lightning.lightning_kokkos_ops import LightningException
 elif device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
+    from pennylane_lightning.lightning_gpu._state_vector import (
+        LightningGPUStateVector as LightningStateVector,
+    )
 
     LightningAdjointJacobian = None
     LightningMeasurements = None
-    LightningStateVector = None
 
     if hasattr(pennylane_lightning, "lightning_gpu_ops"):
         import pennylane_lightning.lightning_gpu_ops as lightning_ops
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index bb24882074..11b4557144 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -29,6 +29,9 @@
         from pennylane_lightning.lightning_kokkos_ops import InitializationSettings
     except ImportError:
         pass
+    
+if device_name == "lightning.gpu":
+    from pennylane_lightning.lightning_gpu._mpi_handler import LightningGPU_MPIHandler
 
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
@@ -39,8 +42,8 @@
         allow_module_level=True,
     )
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
+# if device_name == "lightning.gpu":
+#     pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
 
 if not LightningDevice._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
@@ -99,10 +102,16 @@ def test_apply_state_vector_with_lightning_handle(tol):
     state_vector_1 = LightningStateVector(2)
     state_vector_1.apply_operations([qml.BasisState(np.array([0, 1]), wires=[0, 1])])
 
-    state_vector_2 = LightningStateVector(2)
-    state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
+    if device_name == 'lightning.gpu':
+        with pytest.raises(qml.DeviceError, match="LightningGPU does not support allocate external state_vector."):
+            state_vector_2 = LightningStateVector(2)
+            state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
+
+    else:
+        state_vector_2 = LightningStateVector(2)
+        state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
 
-    assert np.allclose(state_vector_1.state, state_vector_2.state, atol=tol, rtol=0)
+        assert np.allclose(state_vector_1.state, state_vector_2.state, atol=tol, rtol=0)
 
 
 @pytest.mark.parametrize(

From 5dac907dd66c236e77886d8c6732baebf586bdb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 9 Sep 2024 19:00:10 +0000
Subject: [PATCH 04/41] tmp commit

---
 .../lightning_gpu/_measurements.py            |   2 +-
 .../lightning_gpu/lightning_gpu.py            |   1 +
 tests/conftest.py                             |   4 +-
 .../test_measurements_class.py                | 292 +++++++++---------
 4 files changed, 153 insertions(+), 146 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 14117fbb54..ac7598ed07 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -83,7 +83,7 @@ def _measure_with_samples_diagonalizing_gates(
         # apply diagonalizing gates
         self._apply_diagonalizing_gates(mps)
 
-        # Specific for Kokkos:
+        # Specific for LGPU:
         total_indices = self._qubit_state.num_wires
         wires = qml.wires.Wires(range(total_indices))
 
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index b1725d6781..a06c47f869 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -37,6 +37,7 @@
     Result_or_ResultBatch,
 )
 
+from ._mpi_handler import LightningGPU_MPIHandler
 from ._adjoint_jacobian import LightningGPUAdjointJacobian
 from ._measurements import LightningGPUMeasurements
 from ._state_vector import LightningGPUStateVector
diff --git a/tests/conftest.py b/tests/conftest.py
index 1b3d97a208..4cc0e4c5c0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -145,9 +145,11 @@ def get_device():
     from pennylane_lightning.lightning_gpu._state_vector import (
         LightningGPUStateVector as LightningStateVector,
     )
+    from pennylane_lightning.lightning_gpu._measurements import (
+        LightningGPUMeasurements as LightningMeasurements,
+    )
 
     LightningAdjointJacobian = None
-    LightningMeasurements = None
 
     if hasattr(pennylane_lightning, "lightning_gpu_ops"):
         import pennylane_lightning.lightning_gpu_ops as lightning_ops
diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index 61af2dab3a..d82dc171b8 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -38,9 +38,6 @@
         allow_module_level=True,
     )
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
-
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 
@@ -411,15 +408,16 @@ def calculate_reference(tape, lightning_sv):
         m = LightningMeasurements(statevector)
         return m.measure_final_state(tape)
 
-    @flaky(max_runs=15)
-    @pytest.mark.parametrize("shots", [None, 200_000, [190_000, 190_000]])
+    @flaky(max_runs=2)
+    # @pytest.mark.parametrize("shots", [None, 200_000, [190_000, 190_000]])
+    @pytest.mark.parametrize("shots", [None, 1_000_000])
     @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
     @pytest.mark.parametrize(
         "observable",
         (
-            [0],
-            [1, 2],
-            [1, 0],
+            # [0],
+            # [1, 2],
+            # [1, 0],
             qml.PauliX(0),
             qml.PauliY(1),
             qml.PauliZ(2),
@@ -452,6 +450,12 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
             pytest.skip(
                 f"Measurement of type {type(measurement).__name__} does not have a keyword argument 'wires'."
             )
+            
+        print()
+        print("shots:",shots)
+        print("measurement:",measurement)
+        print("observable:", observable)
+            
         rtol = 1.0e-2  # 1% of expected value as tolerance
         if shots != None and measurement is qml.expval:
             # Increase the number of shots
@@ -508,142 +512,142 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
             # allclose -> absolute(a - b) <= (atol + rtol * absolute(b))
             assert np.allclose(result, expected, rtol=rtol, atol=atol)
 
-    @flaky(max_runs=10)
-    @pytest.mark.parametrize("shots", [None, 100_000, (90_000, 90_000)])
-    @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
-    @pytest.mark.parametrize(
-        "obs0_",
-        (
-            qml.PauliX(0),
-            qml.PauliY(1),
-            qml.PauliZ(2),
-            qml.sum(qml.PauliX(0), qml.PauliY(0)),
-            qml.prod(qml.PauliX(0), qml.PauliY(1)),
-            qml.s_prod(2.0, qml.PauliX(0)),
-            qml.Hermitian(get_hermitian_matrix(2), wires=[0]),
-            qml.Hermitian(get_hermitian_matrix(2**2), wires=[2, 3]),
-            qml.Hamiltonian(
-                [1.0, 2.0, 3.0], [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(2) @ qml.PauliZ(3)]
-            ),
-            qml.SparseHamiltonian(get_sparse_hermitian_matrix(2**4), wires=range(4)),
-        ),
-    )
-    @pytest.mark.parametrize(
-        "obs1_",
-        (
-            qml.PauliX(0),
-            qml.PauliY(1),
-            qml.PauliZ(2),
-            qml.sum(qml.PauliX(0), qml.PauliY(0)),
-            qml.prod(qml.PauliX(0), qml.PauliY(1)),
-            qml.s_prod(2.0, qml.PauliX(0)),
-            qml.Hermitian(get_hermitian_matrix(2), wires=[0]),
-            qml.Hermitian(get_hermitian_matrix(2**2), wires=[2, 3]),
-            qml.Hamiltonian(
-                [1.0, 2.0, 3.0], [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(2) @ qml.PauliZ(3)]
-            ),
-            qml.SparseHamiltonian(get_sparse_hermitian_matrix(2**4), wires=range(4)),
-        ),
-    )
-    def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_sv, tol):
-        skip_list = (
-            qml.ops.Sum,
-            qml.ops.SProd,
-            qml.ops.Prod,
-            qml.Hamiltonian,
-            qml.SparseHamiltonian,
-        )
-        if measurement is qml.probs and (
-            isinstance(obs0_, skip_list) or isinstance(obs1_, skip_list)
-        ):
-            pytest.skip(
-                f"Observable of type {type(obs0_).__name__} is not supported for rotating probabilities."
-            )
-
-        rtol = 1.0e-2  # 1% of expected value as tolerance
-        if shots != None and measurement is qml.expval:
-            # Increase the number of shots
-            if isinstance(shots, int):
-                shots *= 10
-            else:
-                shots = [i * 10 for i in shots]
-
-            # Extra tolerance
-            rtol = 5.0e-2  # 5% of expected value as tolerance
-
-        n_qubits = 4
-        n_layers = 1
-        np.random.seed(0)
-        weights = np.random.rand(n_layers, n_qubits, 3)
-        ops = [qml.Hadamard(i) for i in range(n_qubits)]
-        ops += [qml.StronglyEntanglingLayers(weights, wires=range(n_qubits))]
-        measurements = [measurement(op=obs0_), measurement(op=obs1_)]
-        tape = qml.tape.QuantumScript(ops, measurements, shots=shots)
-
-        statevector = lightning_sv(n_qubits)
-        statevector = statevector.get_final_state(tape)
-        m = LightningMeasurements(statevector)
-
-        skip_list = (
-            qml.ops.Sum,
-            qml.Hamiltonian,
-            qml.SparseHamiltonian,
-        )
-        do_skip = measurement is qml.var and (
-            isinstance(obs0_, skip_list) or isinstance(obs1_, skip_list)
-        )
-        do_skip = do_skip or (
-            measurement is qml.expval
-            and (
-                isinstance(obs0_, qml.SparseHamiltonian) or isinstance(obs1_, qml.SparseHamiltonian)
-            )
-        )
-        do_skip = do_skip and shots is not None
-        if do_skip:
-            with pytest.raises(TypeError):
-                _ = m.measure_final_state(tape)
-            return
-        else:
-            result = m.measure_final_state(tape)
-
-        expected = self.calculate_reference(tape, lightning_sv)
-        if len(expected) == 1:
-            expected = expected[0]
-
-        assert isinstance(result, Sequence)
-        assert len(result) == len(expected)
-        # a few tests may fail in single precision, and hence we increase the tolerance
-        atol = tol if shots is None else max(tol, 1.0e-2)
-        rtol = max(tol, rtol)  # % of expected value as tolerance
-        for r, e in zip(result, expected):
-            if isinstance(shots, tuple) and isinstance(r[0], np.ndarray):
-                r = np.concatenate(r)
-                e = np.concatenate(e)
-            # allclose -> absolute(r - e) <= (atol + rtol * absolute(e))
-            assert np.allclose(r, e, atol=atol, rtol=rtol)
-
-    @pytest.mark.parametrize(
-        "cases",
-        [
-            [[0, 1], [1, 0]],
-            [[1, 0], [0, 1]],
-        ],
-    )
-    def test_probs_tape_unordered_wires(self, cases, tol):
-        """Test probs with a circuit on wires=[0] fails for out-of-order wires passed to probs."""
-
-        x, y, z = [0.5, 0.3, -0.7]
-        dev = qml.device(device_name, wires=cases[1])
-
-        def circuit():
-            qml.RX(0.4, wires=[0])
-            qml.Rot(x, y, z, wires=[0])
-            qml.RY(-0.2, wires=[0])
-            return qml.probs(wires=cases[0])
-
-        expected = qml.QNode(circuit, qml.device("default.qubit", wires=cases[1]))()
-        results = qml.QNode(circuit, dev)()
-        assert np.allclose(expected, results, tol)
+    # @flaky(max_runs=10)
+    # @pytest.mark.parametrize("shots", [None, 100_000, (90_000, 90_000)])
+    # @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
+    # @pytest.mark.parametrize(
+    #     "obs0_",
+    #     (
+    #         qml.PauliX(0),
+    #         qml.PauliY(1),
+    #         qml.PauliZ(2),
+    #         qml.sum(qml.PauliX(0), qml.PauliY(0)),
+    #         qml.prod(qml.PauliX(0), qml.PauliY(1)),
+    #         qml.s_prod(2.0, qml.PauliX(0)),
+    #         qml.Hermitian(get_hermitian_matrix(2), wires=[0]),
+    #         qml.Hermitian(get_hermitian_matrix(2**2), wires=[2, 3]),
+    #         qml.Hamiltonian(
+    #             [1.0, 2.0, 3.0], [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(2) @ qml.PauliZ(3)]
+    #         ),
+    #         qml.SparseHamiltonian(get_sparse_hermitian_matrix(2**4), wires=range(4)),
+    #     ),
+    # )
+    # @pytest.mark.parametrize(
+    #     "obs1_",
+    #     (
+    #         qml.PauliX(0),
+    #         qml.PauliY(1),
+    #         qml.PauliZ(2),
+    #         qml.sum(qml.PauliX(0), qml.PauliY(0)),
+    #         qml.prod(qml.PauliX(0), qml.PauliY(1)),
+    #         qml.s_prod(2.0, qml.PauliX(0)),
+    #         qml.Hermitian(get_hermitian_matrix(2), wires=[0]),
+    #         qml.Hermitian(get_hermitian_matrix(2**2), wires=[2, 3]),
+    #         qml.Hamiltonian(
+    #             [1.0, 2.0, 3.0], [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(2) @ qml.PauliZ(3)]
+    #         ),
+    #         qml.SparseHamiltonian(get_sparse_hermitian_matrix(2**4), wires=range(4)),
+    #     ),
+    # )
+    # def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_sv, tol):
+    #     skip_list = (
+    #         qml.ops.Sum,
+    #         qml.ops.SProd,
+    #         qml.ops.Prod,
+    #         qml.Hamiltonian,
+    #         qml.SparseHamiltonian,
+    #     )
+    #     if measurement is qml.probs and (
+    #         isinstance(obs0_, skip_list) or isinstance(obs1_, skip_list)
+    #     ):
+    #         pytest.skip(
+    #             f"Observable of type {type(obs0_).__name__} is not supported for rotating probabilities."
+    #         )
+
+    #     rtol = 1.0e-2  # 1% of expected value as tolerance
+    #     if shots != None and measurement is qml.expval:
+    #         # Increase the number of shots
+    #         if isinstance(shots, int):
+    #             shots *= 10
+    #         else:
+    #             shots = [i * 10 for i in shots]
+
+    #         # Extra tolerance
+    #         rtol = 5.0e-2  # 5% of expected value as tolerance
+
+    #     n_qubits = 4
+    #     n_layers = 1
+    #     np.random.seed(0)
+    #     weights = np.random.rand(n_layers, n_qubits, 3)
+    #     ops = [qml.Hadamard(i) for i in range(n_qubits)]
+    #     ops += [qml.StronglyEntanglingLayers(weights, wires=range(n_qubits))]
+    #     measurements = [measurement(op=obs0_), measurement(op=obs1_)]
+    #     tape = qml.tape.QuantumScript(ops, measurements, shots=shots)
+
+    #     statevector = lightning_sv(n_qubits)
+    #     statevector = statevector.get_final_state(tape)
+    #     m = LightningMeasurements(statevector)
+
+    #     skip_list = (
+    #         qml.ops.Sum,
+    #         qml.Hamiltonian,
+    #         qml.SparseHamiltonian,
+    #     )
+    #     do_skip = measurement is qml.var and (
+    #         isinstance(obs0_, skip_list) or isinstance(obs1_, skip_list)
+    #     )
+    #     do_skip = do_skip or (
+    #         measurement is qml.expval
+    #         and (
+    #             isinstance(obs0_, qml.SparseHamiltonian) or isinstance(obs1_, qml.SparseHamiltonian)
+    #         )
+    #     )
+    #     do_skip = do_skip and shots is not None
+    #     if do_skip:
+    #         with pytest.raises(TypeError):
+    #             _ = m.measure_final_state(tape)
+    #         return
+    #     else:
+    #         result = m.measure_final_state(tape)
+
+    #     expected = self.calculate_reference(tape, lightning_sv)
+    #     if len(expected) == 1:
+    #         expected = expected[0]
+
+    #     assert isinstance(result, Sequence)
+    #     assert len(result) == len(expected)
+    #     # a few tests may fail in single precision, and hence we increase the tolerance
+    #     atol = tol if shots is None else max(tol, 1.0e-2)
+    #     rtol = max(tol, rtol)  # % of expected value as tolerance
+    #     for r, e in zip(result, expected):
+    #         if isinstance(shots, tuple) and isinstance(r[0], np.ndarray):
+    #             r = np.concatenate(r)
+    #             e = np.concatenate(e)
+    #         # allclose -> absolute(r - e) <= (atol + rtol * absolute(e))
+    #         assert np.allclose(r, e, atol=atol, rtol=rtol)
+
+    # @pytest.mark.parametrize(
+    #     "cases",
+    #     [
+    #         [[0, 1], [1, 0]],
+    #         [[1, 0], [0, 1]],
+    #     ],
+    # )
+    # def test_probs_tape_unordered_wires(self, cases, tol):
+    #     """Test probs with a circuit on wires=[0] fails for out-of-order wires passed to probs."""
+
+    #     x, y, z = [0.5, 0.3, -0.7]
+    #     dev = qml.device(device_name, wires=cases[1])
+
+    #     def circuit():
+    #         qml.RX(0.4, wires=[0])
+    #         qml.Rot(x, y, z, wires=[0])
+    #         qml.RY(-0.2, wires=[0])
+    #         return qml.probs(wires=cases[0])
+
+    #     expected = qml.QNode(circuit, qml.device("default.qubit", wires=cases[1]))()
+    #     results = qml.QNode(circuit, dev)()
+    #     assert np.allclose(expected, results, tol)
 
 
 class TestControlledOps:

From bfd0771850330e453cb9412444e33d2624c1e544 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 9 Sep 2024 22:54:20 +0000
Subject: [PATCH 05/41] Solve prop issue

---
 .../core/_measurements_base.py                |  20 ++
 .../lightning_gpu/_state_vector.py            |  37 +++
 .../lightning_qubit/_state_vector.py          |  22 +-
 pyproject.toml                                |   2 +-
 tests/conftest.py                             |   3 +-
 .../test_measurements_class.py                | 239 +++++++++---------
 tests/test_measurements.py                    |   4 +-
 7 files changed, 202 insertions(+), 125 deletions(-)

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index fbd3d023d8..dabec30eda 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -140,14 +140,34 @@ def probs(self, measurementprocess: MeasurementProcess):
             Probabilities of the supplied observable or wires
         """
         diagonalizing_gates = measurementprocess.diagonalizing_gates()
+        # print('*'*100)
+        # print("probs: diagonalizing_gates:", diagonalizing_gates)
         if diagonalizing_gates:
             self._qubit_state.apply_operations(diagonalizing_gates)
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+        
+        # print("probs: result:",results)
+        # print('*'*100)
         if diagonalizing_gates:
             self._qubit_state.apply_operations(
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
+            
+        if len(results) > 0:
+            num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
+            return results.reshape([2] * num_local_wires).transpose().reshape(-1)
+
         return results
+    
+        # translate to wire labels used by device
+        observable_wires = self.map_wires(wires)
+        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
+        local_prob = self.measurements.probs(observable_wires)
+        if len(local_prob) > 0:
+            num_local_wires = len(local_prob).bit_length() - 1 if len(local_prob) > 0 else 0
+            return local_prob.reshape([2] * num_local_wires).transpose().reshape(-1)
+        return local_prob
+
 
     def var(self, measurementprocess: MeasurementProcess):
         """Variance of the supplied observable contained in the MeasurementProcess.
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 1d1d8c775e..18a0f45fb2 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -342,6 +342,11 @@ def _apply_lightning(
                 invert_param = False
             method = getattr(state, name, None)
             wires = list(operation.wires)
+            
+            # print("statevector: _apply_lightning:  state:",state.__dir__)
+            # print("statevector: _apply_lightning:    ops:",operation)
+            # print("statevector: _apply_lightning:   name:",name)
+            # print("statevector: _apply_lightning: method:",method)
 
             if isinstance(operation, Conditional):
                 if operation.meas_val.concretize(mid_measurements):
@@ -364,10 +369,17 @@ def _apply_lightning(
                 except AttributeError:  # pragma: no cover
                     # To support older versions of PL
                     mat = operation.matrix
+                    
+                
                 r_dtype = np.float32 if self.dtype == np.complex64 else np.float64
                 param = [[r_dtype(operation.hash)]] if isinstance(operation, gate_cache_needs_hash) else []
+                # param = []
                 if len(mat) == 0:
                     raise ValueError("Unsupported operation")
+
+                # print("statevector: _apply_lightning: method:",method)
+                # print("statevector: _apply_lightning: mat:", mat)
+
                 self._qubit_state.apply(
                     name,
                     wires,
@@ -375,4 +387,29 @@ def _apply_lightning(
                     param,
                     mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
                 )  # Parameters can be ignored for explicit matrices; F-order for cuQuantum
+                
+                # ----------------------------------------------------------
+                # method = getattr(state, "applyMatrix")
+                # # print("statevector: _apply_lightning: method:",method)
+                # # print("statevector: _apply_lightning: matrix:",qml.matrix(operation))
+                # # print("statevector: _apply_lightning: matrix:",operation.matrix)
+                
+                # try:
+                #     mat = qml.matrix(operation)
+                # except AttributeError:  # pragma: no cover
+                #     # To support older versions of PL
+                #     mat = operation.matrix
+
+                # # mat = mat.ravel(order='C')
+                # # mat = mat.conjugate().transpose()
+                
+                # print("statevector: _apply_lightning: mat:", mat)
+                # method(mat.ravel(order="C"), wires, False)
+                
+                # # try:
+                # #     method(mat, wires, False)
+                # # except AttributeError:  # pragma: no cover
+                # #     # To support older versions of PL
+                # #     method(mat, wires, False)
+
 
diff --git a/pennylane_lightning/lightning_qubit/_state_vector.py b/pennylane_lightning/lightning_qubit/_state_vector.py
index 4e47b2fb66..08a0c107fa 100644
--- a/pennylane_lightning/lightning_qubit/_state_vector.py
+++ b/pennylane_lightning/lightning_qubit/_state_vector.py
@@ -192,9 +192,15 @@ def _apply_lightning(
             else:
                 name = operation.name
                 invert_param = False
+
             method = getattr(state, name, None)
             wires = list(operation.wires)
 
+            print("statevector: _apply_lightning:  state:",state.__dir__)
+            print("statevector: _apply_lightning:    ops:",operation)
+            print("statevector: _apply_lightning:   name:",name)
+            print("statevector: _apply_lightning: method:",method)
+
             if isinstance(operation, Conditional):
                 if operation.meas_val.concretize(mid_measurements):
                     self._apply_lightning([operation.base])
@@ -219,8 +225,14 @@ def _apply_lightning(
                 # Inverse can be set to False since qml.matrix(operation) is already in
                 # inverted form
                 method = getattr(state, "applyMatrix")
-                try:
-                    method(qml.matrix(operation), wires, False)
-                except AttributeError:  # pragma: no cover
-                    # To support older versions of PL
-                    method(operation.matrix, wires, False)
+                print("statevector: _apply_lightning: method:",method)
+                print("statevector: _apply_lightning: matrix:",qml.matrix(operation))
+                # print("statevector: _apply_lightning: matrix:",operation.matrix)
+                
+                method(qml.matrix(operation), wires, False)
+                
+                # try:
+                #     method(qml.matrix(operation), wires, False)
+                # except AttributeError:  # pragma: no cover
+                #     # To support older versions of PL
+                #     method(operation.matrix, wires, False)
diff --git a/pyproject.toml b/pyproject.toml
index 1666cc2427..4cda66723d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ description = "PennyLane-Lightning plugin"
 readme = "README.rst"
 requires-python = ">=3.9"
 classifiers = [ "Development Status :: 4 - Beta", "Environment :: Console", "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Natural Language :: English", "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX", "Operating System :: POSIX :: Linux", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Physics",]
-dependencies = [ "pennylane>=0.37", "pennylane_lightning==0.38.0-dev34",]
+dependencies = [ "pennylane>=0.37", "pennylane_lightning==0.39.0-dev2",]
 dynamic = [ "version",]
 [[project.maintainers]]
 name = "Xanadu Quantum Technologies Inc."
diff --git a/tests/conftest.py b/tests/conftest.py
index 4cc0e4c5c0..94d6bd5267 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -192,7 +192,8 @@ def _device(wires, shots=None):
 # General LightningStateVector fixture, for any number of wires.
 @pytest.fixture(
     scope="function",
-    params=[np.complex64, np.complex128],
+    # params=[np.complex64, np.complex128],
+    params=[np.complex128],
 )
 def lightning_sv(request):
     def _statevector(num_wires):
diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index d82dc171b8..d95995993f 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -46,6 +46,7 @@
 
 
 def get_hermitian_matrix(n):
+    np.random.seed(33)
     H = np.random.rand(n, n) + 1.0j * np.random.rand(n, n)
     return H + np.conj(H).T
 
@@ -408,16 +409,17 @@ def calculate_reference(tape, lightning_sv):
         m = LightningMeasurements(statevector)
         return m.measure_final_state(tape)
 
-    @flaky(max_runs=2)
+    @flaky(max_runs=1)
     # @pytest.mark.parametrize("shots", [None, 200_000, [190_000, 190_000]])
     @pytest.mark.parametrize("shots", [None, 1_000_000])
     @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
+    # @pytest.mark.parametrize("measurement", [qml.probs])
     @pytest.mark.parametrize(
         "observable",
         (
-            # [0],
-            # [1, 2],
-            # [1, 0],
+            [0],
+            [1, 2],
+            [1, 0],
             qml.PauliX(0),
             qml.PauliY(1),
             qml.PauliZ(2),
@@ -481,6 +483,7 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
         tape = qml.tape.QuantumScript(ops, measurements, shots=shots)
 
         statevector = lightning_sv(n_qubits)
+        # print("dtype:",statevector.dtype)
         statevector = statevector.get_final_state(tape)
         m = LightningMeasurements(statevector)
 
@@ -502,6 +505,9 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
 
         expected = self.calculate_reference(tape, lightning_sv)
 
+        print("Result:")
+        print(f'expected: {expected}')
+        print(f'  result: {result}')
         # a few tests may fail in single precision, and hence we increase the tolerance
         if shots is None:
             assert np.allclose(result, expected, max(tol, 1.0e-4))
@@ -512,119 +518,120 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
             # allclose -> absolute(a - b) <= (atol + rtol * absolute(b))
             assert np.allclose(result, expected, rtol=rtol, atol=atol)
 
-    # @flaky(max_runs=10)
+    @flaky(max_runs=1)
     # @pytest.mark.parametrize("shots", [None, 100_000, (90_000, 90_000)])
-    # @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
-    # @pytest.mark.parametrize(
-    #     "obs0_",
-    #     (
-    #         qml.PauliX(0),
-    #         qml.PauliY(1),
-    #         qml.PauliZ(2),
-    #         qml.sum(qml.PauliX(0), qml.PauliY(0)),
-    #         qml.prod(qml.PauliX(0), qml.PauliY(1)),
-    #         qml.s_prod(2.0, qml.PauliX(0)),
-    #         qml.Hermitian(get_hermitian_matrix(2), wires=[0]),
-    #         qml.Hermitian(get_hermitian_matrix(2**2), wires=[2, 3]),
-    #         qml.Hamiltonian(
-    #             [1.0, 2.0, 3.0], [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(2) @ qml.PauliZ(3)]
-    #         ),
-    #         qml.SparseHamiltonian(get_sparse_hermitian_matrix(2**4), wires=range(4)),
-    #     ),
-    # )
-    # @pytest.mark.parametrize(
-    #     "obs1_",
-    #     (
-    #         qml.PauliX(0),
-    #         qml.PauliY(1),
-    #         qml.PauliZ(2),
-    #         qml.sum(qml.PauliX(0), qml.PauliY(0)),
-    #         qml.prod(qml.PauliX(0), qml.PauliY(1)),
-    #         qml.s_prod(2.0, qml.PauliX(0)),
-    #         qml.Hermitian(get_hermitian_matrix(2), wires=[0]),
-    #         qml.Hermitian(get_hermitian_matrix(2**2), wires=[2, 3]),
-    #         qml.Hamiltonian(
-    #             [1.0, 2.0, 3.0], [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(2) @ qml.PauliZ(3)]
-    #         ),
-    #         qml.SparseHamiltonian(get_sparse_hermitian_matrix(2**4), wires=range(4)),
-    #     ),
-    # )
-    # def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_sv, tol):
-    #     skip_list = (
-    #         qml.ops.Sum,
-    #         qml.ops.SProd,
-    #         qml.ops.Prod,
-    #         qml.Hamiltonian,
-    #         qml.SparseHamiltonian,
-    #     )
-    #     if measurement is qml.probs and (
-    #         isinstance(obs0_, skip_list) or isinstance(obs1_, skip_list)
-    #     ):
-    #         pytest.skip(
-    #             f"Observable of type {type(obs0_).__name__} is not supported for rotating probabilities."
-    #         )
-
-    #     rtol = 1.0e-2  # 1% of expected value as tolerance
-    #     if shots != None and measurement is qml.expval:
-    #         # Increase the number of shots
-    #         if isinstance(shots, int):
-    #             shots *= 10
-    #         else:
-    #             shots = [i * 10 for i in shots]
-
-    #         # Extra tolerance
-    #         rtol = 5.0e-2  # 5% of expected value as tolerance
-
-    #     n_qubits = 4
-    #     n_layers = 1
-    #     np.random.seed(0)
-    #     weights = np.random.rand(n_layers, n_qubits, 3)
-    #     ops = [qml.Hadamard(i) for i in range(n_qubits)]
-    #     ops += [qml.StronglyEntanglingLayers(weights, wires=range(n_qubits))]
-    #     measurements = [measurement(op=obs0_), measurement(op=obs1_)]
-    #     tape = qml.tape.QuantumScript(ops, measurements, shots=shots)
-
-    #     statevector = lightning_sv(n_qubits)
-    #     statevector = statevector.get_final_state(tape)
-    #     m = LightningMeasurements(statevector)
-
-    #     skip_list = (
-    #         qml.ops.Sum,
-    #         qml.Hamiltonian,
-    #         qml.SparseHamiltonian,
-    #     )
-    #     do_skip = measurement is qml.var and (
-    #         isinstance(obs0_, skip_list) or isinstance(obs1_, skip_list)
-    #     )
-    #     do_skip = do_skip or (
-    #         measurement is qml.expval
-    #         and (
-    #             isinstance(obs0_, qml.SparseHamiltonian) or isinstance(obs1_, qml.SparseHamiltonian)
-    #         )
-    #     )
-    #     do_skip = do_skip and shots is not None
-    #     if do_skip:
-    #         with pytest.raises(TypeError):
-    #             _ = m.measure_final_state(tape)
-    #         return
-    #     else:
-    #         result = m.measure_final_state(tape)
-
-    #     expected = self.calculate_reference(tape, lightning_sv)
-    #     if len(expected) == 1:
-    #         expected = expected[0]
-
-    #     assert isinstance(result, Sequence)
-    #     assert len(result) == len(expected)
-    #     # a few tests may fail in single precision, and hence we increase the tolerance
-    #     atol = tol if shots is None else max(tol, 1.0e-2)
-    #     rtol = max(tol, rtol)  # % of expected value as tolerance
-    #     for r, e in zip(result, expected):
-    #         if isinstance(shots, tuple) and isinstance(r[0], np.ndarray):
-    #             r = np.concatenate(r)
-    #             e = np.concatenate(e)
-    #         # allclose -> absolute(r - e) <= (atol + rtol * absolute(e))
-    #         assert np.allclose(r, e, atol=atol, rtol=rtol)
+    @pytest.mark.parametrize("shots", [None, 100_000])
+    @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
+    @pytest.mark.parametrize(
+        "obs0_",
+        (
+            qml.PauliX(0),
+            qml.PauliY(1),
+            qml.PauliZ(2),
+            qml.sum(qml.PauliX(0), qml.PauliY(0)),
+            qml.prod(qml.PauliX(0), qml.PauliY(1)),
+            qml.s_prod(2.0, qml.PauliX(0)),
+            qml.Hermitian(get_hermitian_matrix(2), wires=[0]),
+            qml.Hermitian(get_hermitian_matrix(2**2), wires=[2, 3]),
+            qml.Hamiltonian(
+                [1.0, 2.0, 3.0], [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(2) @ qml.PauliZ(3)]
+            ),
+            qml.SparseHamiltonian(get_sparse_hermitian_matrix(2**4), wires=range(4)),
+        ),
+    )
+    @pytest.mark.parametrize(
+        "obs1_",
+        (
+            qml.PauliX(0),
+            qml.PauliY(1),
+            qml.PauliZ(2),
+            qml.sum(qml.PauliX(0), qml.PauliY(0)),
+            qml.prod(qml.PauliX(0), qml.PauliY(1)),
+            qml.s_prod(2.0, qml.PauliX(0)),
+            qml.Hermitian(get_hermitian_matrix(2), wires=[0]),
+            qml.Hermitian(get_hermitian_matrix(2**2), wires=[2, 3]),
+            qml.Hamiltonian(
+                [1.0, 2.0, 3.0], [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(2) @ qml.PauliZ(3)]
+            ),
+            qml.SparseHamiltonian(get_sparse_hermitian_matrix(2**4), wires=range(4)),
+        ),
+    )
+    def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_sv, tol):
+        skip_list = (
+            qml.ops.Sum,
+            qml.ops.SProd,
+            qml.ops.Prod,
+            qml.Hamiltonian,
+            qml.SparseHamiltonian,
+        )
+        if measurement is qml.probs and (
+            isinstance(obs0_, skip_list) or isinstance(obs1_, skip_list)
+        ):
+            pytest.skip(
+                f"Observable of type {type(obs0_).__name__} is not supported for rotating probabilities."
+            )
+
+        rtol = 1.0e-2  # 1% of expected value as tolerance
+        if shots != None and measurement is qml.expval:
+            # Increase the number of shots
+            if isinstance(shots, int):
+                shots *= 10
+            else:
+                shots = [i * 10 for i in shots]
+
+            # Extra tolerance
+            rtol = 5.0e-2  # 5% of expected value as tolerance
+
+        n_qubits = 4
+        n_layers = 1
+        np.random.seed(0)
+        weights = np.random.rand(n_layers, n_qubits, 3)
+        ops = [qml.Hadamard(i) for i in range(n_qubits)]
+        ops += [qml.StronglyEntanglingLayers(weights, wires=range(n_qubits))]
+        measurements = [measurement(op=obs0_), measurement(op=obs1_)]
+        tape = qml.tape.QuantumScript(ops, measurements, shots=shots)
+
+        statevector = lightning_sv(n_qubits)
+        statevector = statevector.get_final_state(tape)
+        m = LightningMeasurements(statevector)
+
+        skip_list = (
+            qml.ops.Sum,
+            qml.Hamiltonian,
+            qml.SparseHamiltonian,
+        )
+        do_skip = measurement is qml.var and (
+            isinstance(obs0_, skip_list) or isinstance(obs1_, skip_list)
+        )
+        do_skip = do_skip or (
+            measurement is qml.expval
+            and (
+                isinstance(obs0_, qml.SparseHamiltonian) or isinstance(obs1_, qml.SparseHamiltonian)
+            )
+        )
+        do_skip = do_skip and shots is not None
+        if do_skip:
+            with pytest.raises(TypeError):
+                _ = m.measure_final_state(tape)
+            return
+        else:
+            result = m.measure_final_state(tape)
+
+        expected = self.calculate_reference(tape, lightning_sv)
+        if len(expected) == 1:
+            expected = expected[0]
+
+        assert isinstance(result, Sequence)
+        assert len(result) == len(expected)
+        # a few tests may fail in single precision, and hence we increase the tolerance
+        atol = tol if shots is None else max(tol, 1.0e-2)
+        rtol = max(tol, rtol)  # % of expected value as tolerance
+        for r, e in zip(result, expected):
+            if isinstance(shots, tuple) and isinstance(r[0], np.ndarray):
+                r = np.concatenate(r)
+                e = np.concatenate(e)
+            # allclose -> absolute(r - e) <= (atol + rtol * absolute(e))
+            assert np.allclose(r, e, atol=atol, rtol=rtol)
 
     # @pytest.mark.parametrize(
     #     "cases",
diff --git a/tests/test_measurements.py b/tests/test_measurements.py
index be8496587b..c094c378f6 100644
--- a/tests/test_measurements.py
+++ b/tests/test_measurements.py
@@ -28,8 +28,8 @@
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
+# if device_name == "lightning.gpu":
+#     pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
 
 
 @pytest.mark.skipif(ld._new_API, reason="Old API required")

From a1ff6c606701af5f5e28e11d6798cf92db451034 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Tue, 10 Sep 2024 22:37:11 +0000
Subject: [PATCH 06/41] ready measurenment class for LGPU

---
 .../core/_measurements_base.py                | 45 +++++++--------
 .../measurements/MeasurementsGPU.hpp          |  3 +-
 .../lightning_gpu/_measurements.py            | 29 +++++++++-
 .../lightning_qubit/_measurements.py          | 25 ++++++++-
 .../lightning_qubit/_state_vector.py          | 22 ++------
 .../test_measurements_class.py                | 56 ++++++++++---------
 6 files changed, 107 insertions(+), 73 deletions(-)

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index dabec30eda..6dc4f1b75a 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -130,6 +130,7 @@ def expval(self, measurementprocess: MeasurementProcess):
             measurementprocess.obs.name, measurementprocess.obs.wires
         )
 
+    @abstractmethod
     def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
@@ -139,35 +140,27 @@ def probs(self, measurementprocess: MeasurementProcess):
         Returns:
             Probabilities of the supplied observable or wires
         """
-        diagonalizing_gates = measurementprocess.diagonalizing_gates()
-        # print('*'*100)
-        # print("probs: diagonalizing_gates:", diagonalizing_gates)
-        if diagonalizing_gates:
-            self._qubit_state.apply_operations(diagonalizing_gates)
-        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+        # diagonalizing_gates = measurementprocess.diagonalizing_gates()
+        # # print('*'*100)
+        # # print("probs: diagonalizing_gates:", diagonalizing_gates)
+        # if diagonalizing_gates:
+        #     self._qubit_state.apply_operations(diagonalizing_gates)
+        # results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
         
-        # print("probs: result:",results)
-        # print('*'*100)
-        if diagonalizing_gates:
-            self._qubit_state.apply_operations(
-                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
-            )
-            
-        if len(results) > 0:
-            num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
-            return results.reshape([2] * num_local_wires).transpose().reshape(-1)
+        # # print("probs: result:",results)
+        # # print('*'*100)
+        # if diagonalizing_gates:
+        #     self._qubit_state.apply_operations(
+        #         [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
+        #     )
+        
+        # # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
+        # if len(results) > 0:
+        #     num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
+        #     return results.reshape([2] * num_local_wires).transpose().reshape(-1)
 
-        return results
+        # return results
     
-        # translate to wire labels used by device
-        observable_wires = self.map_wires(wires)
-        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
-        local_prob = self.measurements.probs(observable_wires)
-        if len(local_prob) > 0:
-            num_local_wires = len(local_prob).bit_length() - 1 if len(local_prob) > 0 else 0
-            return local_prob.reshape([2] * num_local_wires).transpose().reshape(-1)
-        return local_prob
-
 
     def var(self, measurementprocess: MeasurementProcess):
         """Variance of the supplied observable contained in the MeasurementProcess.
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
index 460a4fa8cb..12d99dedd9 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
@@ -273,7 +273,8 @@ class Measurements final
         PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerSample(
             this->_statevector.getCusvHandle(), sampler, bitStrings.data(),
             bitOrdering.data(), bitStringLen, rand_nums.data(), num_samples,
-            CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER));
+            // CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER));
+            CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
         PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
             this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index ac7598ed07..3839c5fd41 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -32,7 +32,7 @@
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import CountsMP, SampleMeasurement, Shots
+from pennylane.measurements import CountsMP, SampleMeasurement, Shots, MeasurementProcess
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
@@ -120,3 +120,30 @@ def _process_single_shot(samples):
         return (
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
+
+    def probs(self, measurementprocess: MeasurementProcess):
+        """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
+
+        Args:
+            measurementprocess (StateMeasurement): measurement to apply to the state
+
+        Returns:
+            Probabilities of the supplied observable or wires
+        """
+        diagonalizing_gates = measurementprocess.diagonalizing_gates()
+
+        if diagonalizing_gates:
+            self._qubit_state.apply_operations(diagonalizing_gates)
+        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+        
+        if diagonalizing_gates:
+            self._qubit_state.apply_operations(
+                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
+            )
+        
+        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
+        if len(results) > 0:
+            num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
+            return results.reshape([2] * num_local_wires).transpose().reshape(-1)
+
+        return results
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index f762fcb7e6..214e57f351 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -26,7 +26,7 @@
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import CountsMP, SampleMeasurement, Shots
+from pennylane.measurements import CountsMP, SampleMeasurement, Shots, MeasurementProcess
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
@@ -141,3 +141,26 @@ def _process_single_shot(samples):
         return (
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
+
+    def probs(self, measurementprocess: MeasurementProcess):
+        """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
+
+        Args:
+            measurementprocess (StateMeasurement): measurement to apply to the state
+
+        Returns:
+            Probabilities of the supplied observable or wires
+        """
+        diagonalizing_gates = measurementprocess.diagonalizing_gates()
+
+        if diagonalizing_gates:
+            self._qubit_state.apply_operations(diagonalizing_gates)
+
+        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+        
+        if diagonalizing_gates:
+            self._qubit_state.apply_operations(
+                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
+            )
+        
+        return results
diff --git a/pennylane_lightning/lightning_qubit/_state_vector.py b/pennylane_lightning/lightning_qubit/_state_vector.py
index 08a0c107fa..4e47b2fb66 100644
--- a/pennylane_lightning/lightning_qubit/_state_vector.py
+++ b/pennylane_lightning/lightning_qubit/_state_vector.py
@@ -192,15 +192,9 @@ def _apply_lightning(
             else:
                 name = operation.name
                 invert_param = False
-
             method = getattr(state, name, None)
             wires = list(operation.wires)
 
-            print("statevector: _apply_lightning:  state:",state.__dir__)
-            print("statevector: _apply_lightning:    ops:",operation)
-            print("statevector: _apply_lightning:   name:",name)
-            print("statevector: _apply_lightning: method:",method)
-
             if isinstance(operation, Conditional):
                 if operation.meas_val.concretize(mid_measurements):
                     self._apply_lightning([operation.base])
@@ -225,14 +219,8 @@ def _apply_lightning(
                 # Inverse can be set to False since qml.matrix(operation) is already in
                 # inverted form
                 method = getattr(state, "applyMatrix")
-                print("statevector: _apply_lightning: method:",method)
-                print("statevector: _apply_lightning: matrix:",qml.matrix(operation))
-                # print("statevector: _apply_lightning: matrix:",operation.matrix)
-                
-                method(qml.matrix(operation), wires, False)
-                
-                # try:
-                #     method(qml.matrix(operation), wires, False)
-                # except AttributeError:  # pragma: no cover
-                #     # To support older versions of PL
-                #     method(operation.matrix, wires, False)
+                try:
+                    method(qml.matrix(operation), wires, False)
+                except AttributeError:  # pragma: no cover
+                    # To support older versions of PL
+                    method(operation.matrix, wires, False)
diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index d95995993f..f32ee39b18 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -410,10 +410,8 @@ def calculate_reference(tape, lightning_sv):
         return m.measure_final_state(tape)
 
     @flaky(max_runs=1)
-    # @pytest.mark.parametrize("shots", [None, 200_000, [190_000, 190_000]])
-    @pytest.mark.parametrize("shots", [None, 1_000_000])
+    @pytest.mark.parametrize("shots", [None, 600_000, [790_000, 790_000]])
     @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
-    # @pytest.mark.parametrize("measurement", [qml.probs])
     @pytest.mark.parametrize(
         "observable",
         (
@@ -519,8 +517,8 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
             assert np.allclose(result, expected, rtol=rtol, atol=atol)
 
     @flaky(max_runs=1)
-    # @pytest.mark.parametrize("shots", [None, 100_000, (90_000, 90_000)])
-    @pytest.mark.parametrize("shots", [None, 100_000])
+    @pytest.mark.parametrize("shots", [None, 100_000, (90_000, 90_000)])
+    # @pytest.mark.parametrize("shots", [None, 100_000])
     @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
     @pytest.mark.parametrize(
         "obs0_",
@@ -633,28 +631,32 @@ def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_s
             # allclose -> absolute(r - e) <= (atol + rtol * absolute(e))
             assert np.allclose(r, e, atol=atol, rtol=rtol)
 
-    # @pytest.mark.parametrize(
-    #     "cases",
-    #     [
-    #         [[0, 1], [1, 0]],
-    #         [[1, 0], [0, 1]],
-    #     ],
-    # )
-    # def test_probs_tape_unordered_wires(self, cases, tol):
-    #     """Test probs with a circuit on wires=[0] fails for out-of-order wires passed to probs."""
-
-    #     x, y, z = [0.5, 0.3, -0.7]
-    #     dev = qml.device(device_name, wires=cases[1])
-
-    #     def circuit():
-    #         qml.RX(0.4, wires=[0])
-    #         qml.Rot(x, y, z, wires=[0])
-    #         qml.RY(-0.2, wires=[0])
-    #         return qml.probs(wires=cases[0])
-
-    #     expected = qml.QNode(circuit, qml.device("default.qubit", wires=cases[1]))()
-    #     results = qml.QNode(circuit, dev)()
-    #     assert np.allclose(expected, results, tol)
+    @pytest.mark.skipif(
+        device_name == "lightning.gpu",
+        reason="lightning.gpu does not support out of order prob.",
+    )
+    @pytest.mark.parametrize(
+        "cases",
+        [
+            [[0, 1], [1, 0]],
+            [[1, 0], [0, 1]],
+        ],
+    )
+    def test_probs_tape_unordered_wires(self, cases, tol):
+        """Test probs with a circuit on wires=[0] fails for out-of-order wires passed to probs."""
+
+        x, y, z = [0.5, 0.3, -0.7]
+        dev = qml.device(device_name, wires=cases[1])
+
+        def circuit():
+            qml.RX(0.4, wires=[0])
+            qml.Rot(x, y, z, wires=[0])
+            qml.RY(-0.2, wires=[0])
+            return qml.probs(wires=cases[0])
+
+        expected = qml.QNode(circuit, qml.device("default.qubit", wires=cases[1]))()
+        results = qml.QNode(circuit, dev)()
+        assert np.allclose(expected, results, tol)
 
 
 class TestControlledOps:

From a0cfb1dd939fa5cac11d22cd9b6b9be4f024d0b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Tue, 10 Sep 2024 22:37:54 +0000
Subject: [PATCH 07/41] print helps for measurements

---
 .../lightning_kokkos/_measurements.py         | 59 ++++++++++++++++++-
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index 6e706614ba..d46eda8a06 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -25,7 +25,7 @@
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import CountsMP, SampleMeasurement, Shots
+from pennylane.measurements import CountsMP, SampleMeasurement, Shots, MeasurementProcess
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
@@ -93,6 +93,9 @@ def _process_single_shot(samples):
 
             return tuple(processed)
 
+        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: wires:",len(wires))
+        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: shot:",shots.total_shots)
+
         try:
             samples = self._measurement_lightning.generate_samples(
                 len(wires), shots.total_shots
@@ -102,16 +105,66 @@ def _process_single_shot(samples):
             if str(e) != "probabilities contain NaN":
                 raise e
             samples = qml.math.full((shots.total_shots, len(wires)), 0)
-
+            
         self._apply_diagonalizing_gates(mps, adjoint=True)
+        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: sample:")
+        unique, counts_uniq = np.unique(samples,axis=0, return_inverse=False, return_counts=True)
+        for val, c in zip(unique, counts_uniq):
+            print(val, c)
+        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: sample:",samples.shape)
+        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: sample:",samples.sum())
 
         # if there is a shot vector, use the shots.bins generator to
         # split samples w.r.t. the shots
         processed_samples = []
+        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: shots.bins:", list(shots.bins()))
         for lower, upper in shots.bins():
-            result = _process_single_shot(samples[..., lower:upper, :])
+            # result = _process_single_shot(samples[..., lower:upper, :])
+            tmp_sample = samples[..., lower:upper, :]
+            print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: tmp_sample:")
+            unique, counts_uniq = np.unique(tmp_sample,axis=0, return_inverse=False, return_counts=True)
+            for val, c in zip(unique, counts_uniq):
+                print(val, c)
+
+            print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: tmp_sample", tmp_sample.shape)
+            print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: tmp_sample", tmp_sample.sum())
+            result = _process_single_shot(tmp_sample)
+
             processed_samples.append(result)
+            
+            print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: result:", result)
 
+        print("I reach this place FDX")
         return (
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
+
+    def probs(self, measurementprocess: MeasurementProcess):
+        """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
+
+        Args:
+            measurementprocess (StateMeasurement): measurement to apply to the state
+
+        Returns:
+            Probabilities of the supplied observable or wires
+        """
+        diagonalizing_gates = measurementprocess.diagonalizing_gates()
+        # print('*'*100)
+        # print("probs: diagonalizing_gates:", diagonalizing_gates)
+        if diagonalizing_gates:
+            self._qubit_state.apply_operations(diagonalizing_gates)
+        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+        
+        # print("probs: result:",results)
+        # print('*'*100)
+        if diagonalizing_gates:
+            self._qubit_state.apply_operations(
+                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
+            )
+        
+        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
+        if len(results) > 0:
+            num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
+            return results.reshape([2] * num_local_wires).transpose().reshape(-1)
+
+        return results

From c7ac82dfc2bfeead1029c746c15a3659d6bc4e51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 09:04:26 -0400
Subject: [PATCH 08/41] grammar correction

---
 pennylane_lightning/lightning_gpu/_state_vector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 18a0f45fb2..fcc92fbbee 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -310,7 +310,7 @@ def _apply_lightning_midmeasure(
         Returns:
             None
         """
-        raise DeviceError("LightningGPU does not support Mid Circuit Measure.")
+        raise DeviceError("LightningGPU does not support Mid-circuit measurements.")
     
     def _apply_lightning(
         self, operations, mid_measurements: dict = None, postselect_mode: str = None

From 66279139c9e71b30774b31ea7c65f2feaaaf75c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 10:07:27 -0400
Subject: [PATCH 09/41] cleaning code

---
 .../core/_measurements_base.py                | 21 --------
 .../measurements/MeasurementsGPU.hpp          |  1 -
 .../lightning_gpu/_measurements.py            |  1 +
 .../lightning_gpu/_mpi_handler.py             | 18 ++++++-
 .../lightning_gpu/_state_vector.py            | 52 ++++---------------
 .../lightning_gpu/lightning_gpu.py            |  8 +--
 .../lightning_kokkos/_measurements.py         | 27 +---------
 tests/conftest.py                             |  3 +-
 .../test_measurements_class.py                | 16 +-----
 .../test_state_vector_class.py                |  2 -
 tests/test_measurements.py                    |  4 +-
 11 files changed, 37 insertions(+), 116 deletions(-)

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index 3c66109372..51538cd5ec 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -140,27 +140,6 @@ def probs(self, measurementprocess: MeasurementProcess):
         Returns:
             Probabilities of the supplied observable or wires
         """
-        # diagonalizing_gates = measurementprocess.diagonalizing_gates()
-        # # print('*'*100)
-        # # print("probs: diagonalizing_gates:", diagonalizing_gates)
-        # if diagonalizing_gates:
-        #     self._qubit_state.apply_operations(diagonalizing_gates)
-        # results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
-        
-        # # print("probs: result:",results)
-        # # print('*'*100)
-        # if diagonalizing_gates:
-        #     self._qubit_state.apply_operations(
-        #         [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
-        #     )
-        
-        # # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
-        # if len(results) > 0:
-        #     num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
-        #     return results.reshape([2] * num_local_wires).transpose().reshape(-1)
-
-        # return results
-    
 
     def var(self, measurementprocess: MeasurementProcess):
         """Variance of the supplied observable contained in the MeasurementProcess.
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
index 12d99dedd9..fe19b5d025 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
@@ -273,7 +273,6 @@ class Measurements final
         PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerSample(
             this->_statevector.getCusvHandle(), sampler, bitStrings.data(),
             bitOrdering.data(), bitStringLen, rand_nums.data(), num_samples,
-            // CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER));
             CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
         PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
             this->_statevector.getDataBuffer().getDevTag().getStreamID()));
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 3839c5fd41..a2bbfcfc9d 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -134,6 +134,7 @@ def probs(self, measurementprocess: MeasurementProcess):
 
         if diagonalizing_gates:
             self._qubit_state.apply_operations(diagonalizing_gates)
+
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
         
         if diagonalizing_gates:
diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index b8e39209c6..11271d139d 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -1,4 +1,19 @@
+# Copyright 2022-2023 Xanadu Quantum Technologies Inc.
 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module contains the :class:`~.LightningGPU_MPIHandler` class, a MPI handler to use LightningGPU device with multi-GPU on multi-node system.
+"""
 
 try:
     # pylint: disable=no-name-in-module
@@ -20,7 +35,7 @@ class LightningGPU_MPIHandler():
     
     MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
     
-    Use the MPI library is necessary to initialize different variables and methods to handle the data across  nodes and perform checks for memory allocation on each device. 
+    Use the MPI library is necessary to initialize different variables and methods to handle the data across nodes and perform checks for memory allocation on each device. 
     
     Args:
         mpi (bool): declare if the device will use the MPI support.
@@ -52,7 +67,6 @@ def __init__(self,
 
             if (mpi_buf_size > 0 
                 and (mpi_buf_size & (mpi_buf_size - 1))):
-
                 raise ValueError(f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2.")
             
             # After check if all MPI parameter are ok
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index fcc92fbbee..20fab56840 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -43,12 +43,11 @@
 from pennylane.wires import Wires
 from pennylane.measurements import MidMeasureMP
 from pennylane.ops import Conditional
-from pennylane import QuantumFunctionError, DeviceError
+from pennylane import DeviceError
 
 from pennylane_lightning.core._serialize import global_phase_diagonal
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
 
-from ._measurements import LightningGPUMeasurements
 from ._mpi_handler import LightningGPU_MPIHandler
 
 gate_cache_needs_hash = (
@@ -73,14 +72,14 @@ class LightningGPUStateVector(LightningBaseStateVector):
         device_name(string): state vector device name. Options: ["lightning.gpu"]
     """
 
-    def __init__(self, num_wires, dtype=np.complex128, device_name="lightning.gpu", 
+    def __init__(self, 
+                 num_wires, 
+                 dtype=np.complex128, 
+                 device_name="lightning.gpu", 
                  mpi_handler = None, 
                  sync=True,
                  ):
 
-        if device_name != "lightning.gpu":
-            raise DeviceError(f'The device name "{device_name}" is not a valid option.')
-
         super().__init__(num_wires, dtype)
 
         self._device_name = device_name
@@ -216,9 +215,10 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
         if isinstance(state, self._qubit_state.__class__):
             raise DeviceError("LightningGPU does not support allocate external state_vector.")
             
-            state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
-            state.getState(state_data)
-            state = state_data
+            # TODO
+            # state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
+            # state.getState(state_data)
+            # state = state_data
 
         state = self._asarray(state, dtype=self.dtype)  # this operation on host
         output_shape = [2] * self._num_local_wires
@@ -343,11 +343,6 @@ def _apply_lightning(
             method = getattr(state, name, None)
             wires = list(operation.wires)
             
-            # print("statevector: _apply_lightning:  state:",state.__dir__)
-            # print("statevector: _apply_lightning:    ops:",operation)
-            # print("statevector: _apply_lightning:   name:",name)
-            # print("statevector: _apply_lightning: method:",method)
-
             if isinstance(operation, Conditional):
                 if operation.meas_val.concretize(mid_measurements):
                     self._apply_lightning([operation.base])
@@ -373,13 +368,9 @@ def _apply_lightning(
                 
                 r_dtype = np.float32 if self.dtype == np.complex64 else np.float64
                 param = [[r_dtype(operation.hash)]] if isinstance(operation, gate_cache_needs_hash) else []
-                # param = []
                 if len(mat) == 0:
                     raise ValueError("Unsupported operation")
 
-                # print("statevector: _apply_lightning: method:",method)
-                # print("statevector: _apply_lightning: mat:", mat)
-
                 self._qubit_state.apply(
                     name,
                     wires,
@@ -388,28 +379,3 @@ def _apply_lightning(
                     mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
                 )  # Parameters can be ignored for explicit matrices; F-order for cuQuantum
                 
-                # ----------------------------------------------------------
-                # method = getattr(state, "applyMatrix")
-                # # print("statevector: _apply_lightning: method:",method)
-                # # print("statevector: _apply_lightning: matrix:",qml.matrix(operation))
-                # # print("statevector: _apply_lightning: matrix:",operation.matrix)
-                
-                # try:
-                #     mat = qml.matrix(operation)
-                # except AttributeError:  # pragma: no cover
-                #     # To support older versions of PL
-                #     mat = operation.matrix
-
-                # # mat = mat.ravel(order='C')
-                # # mat = mat.conjugate().transpose()
-                
-                # print("statevector: _apply_lightning: mat:", mat)
-                # method(mat.ravel(order="C"), wires, False)
-                
-                # # try:
-                # #     method(mat, wires, False)
-                # # except AttributeError:  # pragma: no cover
-                # #     # To support older versions of PL
-                # #     method(mat, wires, False)
-
-
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index a06c47f869..f9ef0048ac 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -20,7 +20,7 @@
 from ctypes.util import find_library
 from importlib import util as imp_util
 from pathlib import Path
-from typing import Optional, Tuple, Callable, Union
+from typing import Optional, Tuple Callable, Union
 
 import numpy as np
 import pennylane as qml
@@ -291,7 +291,7 @@ def __init__(  # pylint: disable=too-many-arguments
         )
 
         # Set the attributes to call the LightningGPU classes
-        self._set_Lightning_classes()
+        self._set_lightning_classes()
 
         # GPU specific options
         self._dp = DevPool()
@@ -303,7 +303,6 @@ def __init__(  # pylint: disable=too-many-arguments
         
         self._num_local_wires = self._mpi_handler.num_local_wires
 
-        print("FSDX")
         self._statevector = self.LightningStateVector(self.num_wires, dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync)
 
 
@@ -312,9 +311,10 @@ def name(self):
         """The name of the device."""
         return "lightning.gpu"
 
-    def _set_Lightning_classes(self):
+    def _set_lightning_classes(self):
         """Load the LightningStateVector, LightningMeasurements, LightningAdjointJacobian as class attribute"""
         self.LightningStateVector = LightningGPUStateVector
+        self.LightningMeasurements = LightningGPUMeasurements
 
     def _setup_execution_config(self, config):
         """
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index d46eda8a06..8070c26f80 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -93,9 +93,6 @@ def _process_single_shot(samples):
 
             return tuple(processed)
 
-        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: wires:",len(wires))
-        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: shot:",shots.total_shots)
-
         try:
             samples = self._measurement_lightning.generate_samples(
                 len(wires), shots.total_shots
@@ -105,36 +102,16 @@ def _process_single_shot(samples):
             if str(e) != "probabilities contain NaN":
                 raise e
             samples = qml.math.full((shots.total_shots, len(wires)), 0)
-            
+
         self._apply_diagonalizing_gates(mps, adjoint=True)
-        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: sample:")
-        unique, counts_uniq = np.unique(samples,axis=0, return_inverse=False, return_counts=True)
-        for val, c in zip(unique, counts_uniq):
-            print(val, c)
-        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: sample:",samples.shape)
-        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: sample:",samples.sum())
 
         # if there is a shot vector, use the shots.bins generator to
         # split samples w.r.t. the shots
         processed_samples = []
-        print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: shots.bins:", list(shots.bins()))
         for lower, upper in shots.bins():
-            # result = _process_single_shot(samples[..., lower:upper, :])
-            tmp_sample = samples[..., lower:upper, :]
-            print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: tmp_sample:")
-            unique, counts_uniq = np.unique(tmp_sample,axis=0, return_inverse=False, return_counts=True)
-            for val, c in zip(unique, counts_uniq):
-                print(val, c)
-
-            print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: tmp_sample", tmp_sample.shape)
-            print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: tmp_sample", tmp_sample.sum())
-            result = _process_single_shot(tmp_sample)
-
+            result = _process_single_shot(samples[..., lower:upper, :])
             processed_samples.append(result)
-            
-            print("Kokkos: Measurements: _measure_with_samples_diagonalizing_gates: result:", result)
 
-        print("I reach this place FDX")
         return (
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
diff --git a/tests/conftest.py b/tests/conftest.py
index 94d6bd5267..4cc0e4c5c0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -192,8 +192,7 @@ def _device(wires, shots=None):
 # General LightningStateVector fixture, for any number of wires.
 @pytest.fixture(
     scope="function",
-    # params=[np.complex64, np.complex128],
-    params=[np.complex128],
+    params=[np.complex64, np.complex128],
 )
 def lightning_sv(request):
     def _statevector(num_wires):
diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index f32ee39b18..5765af6d69 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -46,7 +46,6 @@
 
 
 def get_hermitian_matrix(n):
-    np.random.seed(33)
     H = np.random.rand(n, n) + 1.0j * np.random.rand(n, n)
     return H + np.conj(H).T
 
@@ -409,7 +408,7 @@ def calculate_reference(tape, lightning_sv):
         m = LightningMeasurements(statevector)
         return m.measure_final_state(tape)
 
-    @flaky(max_runs=1)
+    @flaky(max_runs=2)
     @pytest.mark.parametrize("shots", [None, 600_000, [790_000, 790_000]])
     @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
     @pytest.mark.parametrize(
@@ -450,12 +449,6 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
             pytest.skip(
                 f"Measurement of type {type(measurement).__name__} does not have a keyword argument 'wires'."
             )
-            
-        print()
-        print("shots:",shots)
-        print("measurement:",measurement)
-        print("observable:", observable)
-            
         rtol = 1.0e-2  # 1% of expected value as tolerance
         if shots != None and measurement is qml.expval:
             # Increase the number of shots
@@ -481,7 +474,6 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
         tape = qml.tape.QuantumScript(ops, measurements, shots=shots)
 
         statevector = lightning_sv(n_qubits)
-        # print("dtype:",statevector.dtype)
         statevector = statevector.get_final_state(tape)
         m = LightningMeasurements(statevector)
 
@@ -503,9 +495,6 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
 
         expected = self.calculate_reference(tape, lightning_sv)
 
-        print("Result:")
-        print(f'expected: {expected}')
-        print(f'  result: {result}')
         # a few tests may fail in single precision, and hence we increase the tolerance
         if shots is None:
             assert np.allclose(result, expected, max(tol, 1.0e-4))
@@ -516,9 +505,8 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
             # allclose -> absolute(a - b) <= (atol + rtol * absolute(b))
             assert np.allclose(result, expected, rtol=rtol, atol=atol)
 
-    @flaky(max_runs=1)
+    @flaky(max_runs=10)
     @pytest.mark.parametrize("shots", [None, 100_000, (90_000, 90_000)])
-    # @pytest.mark.parametrize("shots", [None, 100_000])
     @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
     @pytest.mark.parametrize(
         "obs0_",
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index bdfc1e2b3f..6e562d80cc 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -42,8 +42,6 @@
         allow_module_level=True,
     )
 
-# if device_name == "lightning.gpu":
-#     pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
 
 if not LightningDevice._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
diff --git a/tests/test_measurements.py b/tests/test_measurements.py
index c094c378f6..be8496587b 100644
--- a/tests/test_measurements.py
+++ b/tests/test_measurements.py
@@ -28,8 +28,8 @@
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
-# if device_name == "lightning.gpu":
-#     pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
+if device_name == "lightning.gpu":
+    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
 
 
 @pytest.mark.skipif(ld._new_API, reason="Old API required")

From 629095304b36f7017b7ddfc8e38d4ffa6a3fb427 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 10:09:42 -0400
Subject: [PATCH 10/41] apply format

---
 .../lightning_gpu/_measurements.py            | 10 +--
 .../lightning_gpu/_mpi_handler.py             | 72 ++++++++++---------
 .../lightning_gpu/_state_vector.py            | 65 ++++++++---------
 .../lightning_gpu/lightning_gpu.py            | 39 +++++-----
 .../lightning_kokkos/_measurements.py         |  6 +-
 .../lightning_qubit/_measurements.py          |  6 +-
 tests/conftest.py                             |  6 +-
 .../test_state_vector_class.py                |  8 ++-
 8 files changed, 107 insertions(+), 105 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index a2bbfcfc9d..82ccb9e30f 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -17,9 +17,9 @@
 
 try:
     from pennylane_lightning.lightning_gpu_ops import MeasurementsC64, MeasurementsC128
-    
+
     try:
-        from pennylane_lightning.lightning_gpu_ops import MeasurementsMPIC64,MeasurementsMPIC128
+        from pennylane_lightning.lightning_gpu_ops import MeasurementsMPIC64, MeasurementsMPIC128
 
         MPI_SUPPORT = True
     except ImportError:
@@ -32,7 +32,7 @@
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import CountsMP, SampleMeasurement, Shots, MeasurementProcess
+from pennylane.measurements import CountsMP, MeasurementProcess, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
@@ -136,12 +136,12 @@ def probs(self, measurementprocess: MeasurementProcess):
             self._qubit_state.apply_operations(diagonalizing_gates)
 
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
-        
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
-        
+
         # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
         if len(results) > 0:
             num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index 11271d139d..46c7e81cc8 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -17,10 +17,8 @@
 
 try:
     # pylint: disable=no-name-in-module
-    from pennylane_lightning.lightning_gpu_ops import (
-        DevTag,
-        MPIManager,
-    )            
+    from pennylane_lightning.lightning_gpu_ops import DevTag, MPIManager
+
     MPI_SUPPORT = True
 except ImportError:
     MPI_SUPPORT = False
@@ -29,46 +27,50 @@
 
 import numpy as np
 
+
 # MPI options
-class LightningGPU_MPIHandler():
-    """MPI handler for PennyLane Lightning GPU device  
-    
+class LightningGPU_MPIHandler:
+    """MPI handler for PennyLane Lightning GPU device
+
     MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
-    
-    Use the MPI library is necessary to initialize different variables and methods to handle the data across nodes and perform checks for memory allocation on each device. 
-    
+
+    Use the MPI library is necessary to initialize different variables and methods to handle the data across nodes and perform checks for memory allocation on each device.
+
     Args:
         mpi (bool): declare if the device will use the MPI support.
         mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
         dev_pool (Callable): Method to handle the GPU devices available.
-        num_wires (int): the number of wires to initialize the device wit.h 
+        num_wires (int): the number of wires to initialize the device wit.h
         c_dtype (np.complex64, np.complex128): Datatypes for statevector representation
-        
+
     """
 
-    def __init__(self, 
-                 mpi: bool, 
-                 mpi_buf_size: int, 
-                 dev_pool: Callable, 
-                 num_wires: int, 
-                 c_dtype: Union[np.complex64, np.complex128]) -> None:
-        
+    def __init__(
+        self,
+        mpi: bool,
+        mpi_buf_size: int,
+        dev_pool: Callable,
+        num_wires: int,
+        c_dtype: Union[np.complex64, np.complex128],
+    ) -> None:
+
         self.use_mpi = mpi
         self.mpi_but_size = mpi_buf_size
         self._dp = dev_pool
-        
-        if self.use_mpi: 
-            
+
+        if self.use_mpi:
+
             if not MPI_SUPPORT:
                 raise ImportError("MPI related APIs are not found.")
 
             if mpi_buf_size < 0:
                 raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
 
-            if (mpi_buf_size > 0 
-                and (mpi_buf_size & (mpi_buf_size - 1))):
-                raise ValueError(f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2.")
-            
+            if mpi_buf_size > 0 and (mpi_buf_size & (mpi_buf_size - 1)):
+                raise ValueError(
+                    f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2."
+                )
+
             # After check if all MPI parameter are ok
             self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
 
@@ -76,44 +78,44 @@ def __init__(self,
             commSize = self._mpi_manager.getSize()
             self.num_global_wires = commSize.bit_length() - 1
             self.num_local_wires = num_wires - self._num_global_wires
-            
+
             # Memory size in bytes
             sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
             if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
                 raise ValueError("The MPI buffer size is larger than the local state vector size.")
 
-        if not self.use_mpi: 
+        if not self.use_mpi:
             self.num_local_wires = num_wires
             self.num_global_wires = num_wires
 
     def _mebibytesToBytes(mebibytes):
         return mebibytes * 1024 * 1024
-    
+
     def _mpi_init_helper(self, num_wires):
         """Set up MPI checks and initializations."""
-        
+
         # initialize MPIManager and config check in the MPIManager ctor
         mpi_manager = MPIManager()
-        
+
         # check if number of GPUs per node is larger than number of processes per node
         numDevices = self._dp.getTotalDevices()
         numProcsNode = mpi_manager.getSizeNode()
-        
+
         if numDevices < numProcsNode:
             raise ValueError(
                 "Number of devices should be larger than or equal to the number of processes on each node."
             )
-        
+
         # check if the process number is larger than number of statevector elements
         if mpi_manager.getSize() > (1 << (num_wires - 1)):
             raise ValueError(
                 "Number of processes should be smaller than the number of statevector elements."
             )
-        
+
         # set GPU device
         rank = self._mpi_manager.getRank()
         deviceid = rank % numProcsNode
         self._dp.setDeviceID(deviceid)
         devtag = DevTag(deviceid)
-        
+
         return (mpi_manager, devtag)
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 20fab56840..126f0d973d 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -15,22 +15,16 @@
 Class implementation for lightning_gpu state-vector manipulation.
 """
 try:
-    from pennylane_lightning.lightning_gpu_ops import (
-        StateVectorC64,
-        StateVectorC128,
-    )
-    
-    try: # Try to import the MPI modules
+    from pennylane_lightning.lightning_gpu_ops import StateVectorC64, StateVectorC128
+
+    try:  # Try to import the MPI modules
         # pylint: disable=no-name-in-module
-        from pennylane_lightning.lightning_gpu_ops import (
-            StateVectorMPIC64,
-            StateVectorMPIC128,
-        )
+        from pennylane_lightning.lightning_gpu_ops import StateVectorMPIC64, StateVectorMPIC128
 
         MPI_SUPPORT = True
     except ImportError:
         MPI_SUPPORT = False
-        
+
 except ImportError:
     pass
 
@@ -39,11 +33,10 @@
 import numpy as np
 import pennylane as qml
 from pennylane import DeviceError
-from pennylane.ops.op_math import Adjoint
-from pennylane.wires import Wires
 from pennylane.measurements import MidMeasureMP
 from pennylane.ops import Conditional
-from pennylane import DeviceError
+from pennylane.ops.op_math import Adjoint
+from pennylane.wires import Wires
 
 from pennylane_lightning.core._serialize import global_phase_diagonal
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
@@ -60,6 +53,7 @@
     qml.QubitUnitary,
 )
 
+
 class LightningGPUStateVector(LightningBaseStateVector):
     """Lightning GPU state-vector class.
 
@@ -72,18 +66,19 @@ class LightningGPUStateVector(LightningBaseStateVector):
         device_name(string): state vector device name. Options: ["lightning.gpu"]
     """
 
-    def __init__(self, 
-                 num_wires, 
-                 dtype=np.complex128, 
-                 device_name="lightning.gpu", 
-                 mpi_handler = None, 
-                 sync=True,
-                 ):
+    def __init__(
+        self,
+        num_wires,
+        dtype=np.complex128,
+        device_name="lightning.gpu",
+        mpi_handler=None,
+        sync=True,
+    ):
 
         super().__init__(num_wires, dtype)
 
         self._device_name = device_name
-        
+
         if mpi_handler is None:
             mpi_handler = LightningGPU_MPIHandler(False, 0, None, num_wires, dtype)
 
@@ -105,7 +100,7 @@ def __init__(self,
 
         if not self._mpi_handler.use_mpi:
             self._qubit_state = self._state_dtype()(self.num_wires)
-            
+
         self._create_basis_state(0)
 
     def _state_dtype(self):
@@ -139,7 +134,6 @@ def syncD2H(self, state_vector, use_async=False):
         [0.+0.j 1.+0.j]
         """
         self._qubit_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
-            
 
     @property
     def state(self):
@@ -158,7 +152,6 @@ def state(self):
         self.syncD2H(state)
         return state
 
-
     def syncH2D(self, state_vector, use_async=False):
         """Copy the state vector data on host provided by the user to the state vector on the device
         Args:
@@ -179,7 +172,7 @@ def syncH2D(self, state_vector, use_async=False):
         1.0
         """
         self._qubit_state.HostToDevice(state_vector.ravel(order="C"), use_async)
-    
+
     @staticmethod
     def _asarray(arr, dtype=None):
         arr = np.asarray(arr)  # arr is not copied
@@ -211,10 +204,10 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
         use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
         Note: This function only supports synchronized memory copy from host to device.
         """
-        
+
         if isinstance(state, self._qubit_state.__class__):
             raise DeviceError("LightningGPU does not support allocate external state_vector.")
-            
+
             # TODO
             # state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
             # state.getState(state_data)
@@ -294,7 +287,7 @@ def _apply_lightning_controlled(self, operation):
         wires = self.wires.indices(operation.wires)
         matrix = global_phase_diagonal(param, self.wires, control_wires, control_values)
         state.apply(name, wires, inv, [[param]], matrix)
-        
+
     def _apply_lightning_midmeasure(
         self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str
     ):
@@ -311,7 +304,7 @@ def _apply_lightning_midmeasure(
             None
         """
         raise DeviceError("LightningGPU does not support Mid-circuit measurements.")
-    
+
     def _apply_lightning(
         self, operations, mid_measurements: dict = None, postselect_mode: str = None
     ):
@@ -342,7 +335,7 @@ def _apply_lightning(
                 invert_param = False
             method = getattr(state, name, None)
             wires = list(operation.wires)
-            
+
             if isinstance(operation, Conditional):
                 if operation.meas_val.concretize(mid_measurements):
                     self._apply_lightning([operation.base])
@@ -364,10 +357,13 @@ def _apply_lightning(
                 except AttributeError:  # pragma: no cover
                     # To support older versions of PL
                     mat = operation.matrix
-                    
-                
+
                 r_dtype = np.float32 if self.dtype == np.complex64 else np.float64
-                param = [[r_dtype(operation.hash)]] if isinstance(operation, gate_cache_needs_hash) else []
+                param = (
+                    [[r_dtype(operation.hash)]]
+                    if isinstance(operation, gate_cache_needs_hash)
+                    else []
+                )
                 if len(mat) == 0:
                     raise ValueError("Unsupported operation")
 
@@ -378,4 +374,3 @@ def _apply_lightning(
                     param,
                     mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
                 )  # Parameters can be ignored for explicit matrices; F-order for cuQuantum
-                
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index f9ef0048ac..ad20da9e78 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -20,7 +20,7 @@
 from ctypes.util import find_library
 from importlib import util as imp_util
 from pathlib import Path
-from typing import Optional, Tuple Callable, Union
+from typing import Callable, Optional, Tuple, Union
 
 import numpy as np
 import pennylane as qml
@@ -37,9 +37,9 @@
     Result_or_ResultBatch,
 )
 
-from ._mpi_handler import LightningGPU_MPIHandler
 from ._adjoint_jacobian import LightningGPUAdjointJacobian
 from ._measurements import LightningGPUMeasurements
+from ._mpi_handler import LightningGPU_MPIHandler
 from ._state_vector import LightningGPUStateVector
 
 try:
@@ -54,11 +54,10 @@
 
     try:
         # pylint: disable=no-name-in-module
-        from pennylane_lightning.lightning_gpu_ops import (
-            DevTag,
-            MPIManager,
-        )
-        from ._mpi_handler import LightningGPU_MPIHandler            
+        from pennylane_lightning.lightning_gpu_ops import DevTag, MPIManager
+
+        from ._mpi_handler import LightningGPU_MPIHandler
+
         MPI_SUPPORT = True
     except ImportError:
         MPI_SUPPORT = False
@@ -150,6 +149,7 @@
     }
 )
 
+
 def stopping_condition(op: Operator) -> bool:
     """A function that determines whether or not an operation is supported by ``lightning.gpu``."""
     # To avoid building matrices beyond the given thresholds.
@@ -203,15 +203,15 @@ def _add_adjoint_transforms(program: TransformProgram) -> None:
     name = "adjoint + lightning.gpu"
     return 0
 
+
 def check_gpu_resources() -> None:
-    """ Check the available resources of each Nvidia GPU """
-    if (find_library("custatevec") is None 
-        and not imp_util.find_spec("cuquantum")):
-        
+    """Check the available resources of each Nvidia GPU"""
+    if find_library("custatevec") is None and not imp_util.find_spec("cuquantum"):
+
         raise ImportError(
             "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
         )
-        
+
     if not DevPool.getTotalDevices():
         raise ValueError("No supported CUDA-capable device found")
 
@@ -280,7 +280,7 @@ def __init__(  # pylint: disable=too-many-arguments
                 "To manually compile from source, follow the instructions at "
                 "https://docs.pennylane.ai/projects/lightning/en/stable/dev/installation.html."
             )
-            
+
         check_gpu_resources()
 
         super().__init__(
@@ -298,13 +298,16 @@ def __init__(  # pylint: disable=too-many-arguments
         self._sync = sync
 
         # Creating the state vector
-        
-        self._mpi_handler = LightningGPU_MPIHandler(mpi, mpi_buf_size, self._dp, self.num_wires, c_dtype)
-        
-        self._num_local_wires = self._mpi_handler.num_local_wires
 
-        self._statevector = self.LightningStateVector(self.num_wires, dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync)
+        self._mpi_handler = LightningGPU_MPIHandler(
+            mpi, mpi_buf_size, self._dp, self.num_wires, c_dtype
+        )
 
+        self._num_local_wires = self._mpi_handler.num_local_wires
+
+        self._statevector = self.LightningStateVector(
+            self.num_wires, dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
+        )
 
     @property
     def name(self):
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index 8070c26f80..5e43a9c8cb 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -25,7 +25,7 @@
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import CountsMP, SampleMeasurement, Shots, MeasurementProcess
+from pennylane.measurements import CountsMP, MeasurementProcess, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
@@ -131,14 +131,14 @@ def probs(self, measurementprocess: MeasurementProcess):
         if diagonalizing_gates:
             self._qubit_state.apply_operations(diagonalizing_gates)
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
-        
+
         # print("probs: result:",results)
         # print('*'*100)
         if diagonalizing_gates:
             self._qubit_state.apply_operations(
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
-        
+
         # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
         if len(results) > 0:
             num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index 214e57f351..6958f5b1e5 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -26,7 +26,7 @@
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import CountsMP, SampleMeasurement, Shots, MeasurementProcess
+from pennylane.measurements import CountsMP, MeasurementProcess, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
@@ -157,10 +157,10 @@ def probs(self, measurementprocess: MeasurementProcess):
             self._qubit_state.apply_operations(diagonalizing_gates)
 
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
-        
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
-        
+
         return results
diff --git a/tests/conftest.py b/tests/conftest.py
index 4cc0e4c5c0..ace8debfd9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -142,12 +142,12 @@ def get_device():
         from pennylane_lightning.lightning_kokkos_ops import LightningException
 elif device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
-    from pennylane_lightning.lightning_gpu._state_vector import (
-        LightningGPUStateVector as LightningStateVector,
-    )
     from pennylane_lightning.lightning_gpu._measurements import (
         LightningGPUMeasurements as LightningMeasurements,
     )
+    from pennylane_lightning.lightning_gpu._state_vector import (
+        LightningGPUStateVector as LightningStateVector,
+    )
 
     LightningAdjointJacobian = None
 
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index 6e562d80cc..b1bcdf1de1 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -29,7 +29,7 @@
         from pennylane_lightning.lightning_kokkos_ops import InitializationSettings
     except ImportError:
         pass
-    
+
 if device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu._mpi_handler import LightningGPU_MPIHandler
 
@@ -90,8 +90,10 @@ def test_apply_state_vector_with_lightning_handle(tol):
     state_vector_1 = LightningStateVector(2)
     state_vector_1.apply_operations([qml.BasisState(np.array([0, 1]), wires=[0, 1])])
 
-    if device_name == 'lightning.gpu':
-        with pytest.raises(qml.DeviceError, match="LightningGPU does not support allocate external state_vector."):
+    if device_name == "lightning.gpu":
+        with pytest.raises(
+            qml.DeviceError, match="LightningGPU does not support allocate external state_vector."
+        ):
             state_vector_2 = LightningStateVector(2)
             state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
 

From 219262b0884a08a17dac61255b9f63b42a878fdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 10:19:11 -0400
Subject: [PATCH 11/41] delete usuless variables

---
 pennylane_lightning/lightning_gpu/_state_vector.py | 2 +-
 pennylane_lightning/lightning_gpu/lightning_gpu.py | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 126f0d973d..17c61e60f6 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -232,7 +232,7 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
         basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
 
         # get basis states to alter on full set of qubits
-        unravelled_indices = np.zeros((2 ** len(device_wires), self.num_wires), dtype=int)
+        unravelled_indices = np.zeros((1 << len(device_wires), self.num_wires), dtype=int)
         unravelled_indices[:, device_wires] = basis_states
 
         # get indices for which the state is changed to input state vector elements
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index ad20da9e78..e330568f48 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -298,13 +298,10 @@ def __init__(  # pylint: disable=too-many-arguments
         self._sync = sync
 
         # Creating the state vector
-
         self._mpi_handler = LightningGPU_MPIHandler(
             mpi, mpi_buf_size, self._dp, self.num_wires, c_dtype
         )
 
-        self._num_local_wires = self._mpi_handler.num_local_wires
-
         self._statevector = self.LightningStateVector(
             self.num_wires, dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
         )

From bca1a7480db9124b693331165534f7ef7abdba80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 10:48:45 -0400
Subject: [PATCH 12/41] delete prints

---
 pennylane_lightning/lightning_kokkos/_measurements.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index 5e43a9c8cb..86c59595b7 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -126,22 +126,15 @@ def probs(self, measurementprocess: MeasurementProcess):
             Probabilities of the supplied observable or wires
         """
         diagonalizing_gates = measurementprocess.diagonalizing_gates()
-        # print('*'*100)
-        # print("probs: diagonalizing_gates:", diagonalizing_gates)
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(diagonalizing_gates)
+
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
 
-        # print("probs: result:",results)
-        # print('*'*100)
         if diagonalizing_gates:
             self._qubit_state.apply_operations(
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
 
-        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
-        if len(results) > 0:
-            num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
-            return results.reshape([2] * num_local_wires).transpose().reshape(-1)
-
         return results

From 0f8f957a008db1fd668e483ea02c560cd4ef5350 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 11:07:42 -0400
Subject: [PATCH 13/41] Revert change in measurenment test

---
 .../test_measurements_class.py                | 37 +++++++------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index 5765af6d69..f89786b8c5 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -408,8 +408,8 @@ def calculate_reference(tape, lightning_sv):
         m = LightningMeasurements(statevector)
         return m.measure_final_state(tape)
 
-    @flaky(max_runs=2)
-    @pytest.mark.parametrize("shots", [None, 600_000, [790_000, 790_000]])
+    @flaky(max_runs=5)
+    @pytest.mark.parametrize("shots", [None, 500_000, [500_000, 500_000]])
     @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
     @pytest.mark.parametrize(
         "observable",
@@ -449,16 +449,12 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
             pytest.skip(
                 f"Measurement of type {type(measurement).__name__} does not have a keyword argument 'wires'."
             )
-        rtol = 1.0e-2  # 1% of expected value as tolerance
         if shots != None and measurement is qml.expval:
             # Increase the number of shots
             if isinstance(shots, int):
-                shots *= 10
+                shots = 1_000_000
             else:
-                shots = [i * 10 for i in shots]
-
-            # Extra tolerance
-            rtol = 5.0e-2  # 5% of expected value as tolerance
+                shots = [1_000_000, 1_000_000]
 
         n_qubits = 4
         n_layers = 1
@@ -499,14 +495,13 @@ def test_single_return_value(self, shots, measurement, observable, lightning_sv,
         if shots is None:
             assert np.allclose(result, expected, max(tol, 1.0e-4))
         else:
-            atol = max(tol, 1.0e-2) if statevector.dtype == np.complex64 else max(tol, 1.0e-3)
-            rtol = max(tol, rtol)  # % of expected value as tolerance
-
+            # TODO Set better atol and rtol
+            dtol = max(tol, 1.0e-2)
             # allclose -> absolute(a - b) <= (atol + rtol * absolute(b))
-            assert np.allclose(result, expected, rtol=rtol, atol=atol)
+            assert np.allclose(result, expected, rtol=dtol, atol=dtol)
 
-    @flaky(max_runs=10)
-    @pytest.mark.parametrize("shots", [None, 100_000, (90_000, 90_000)])
+    @flaky(max_runs=5)
+    @pytest.mark.parametrize("shots", [None, 400_000, (400_000, 400_000)])
     @pytest.mark.parametrize("measurement", [qml.expval, qml.probs, qml.var])
     @pytest.mark.parametrize(
         "obs0_",
@@ -557,16 +552,12 @@ def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_s
                 f"Observable of type {type(obs0_).__name__} is not supported for rotating probabilities."
             )
 
-        rtol = 1.0e-2  # 1% of expected value as tolerance
         if shots != None and measurement is qml.expval:
             # Increase the number of shots
             if isinstance(shots, int):
-                shots *= 10
+                shots = 1_000_000
             else:
-                shots = [i * 10 for i in shots]
-
-            # Extra tolerance
-            rtol = 5.0e-2  # 5% of expected value as tolerance
+                shots = [1_000_000, 1_000_000]
 
         n_qubits = 4
         n_layers = 1
@@ -610,14 +601,14 @@ def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_s
         assert isinstance(result, Sequence)
         assert len(result) == len(expected)
         # a few tests may fail in single precision, and hence we increase the tolerance
-        atol = tol if shots is None else max(tol, 1.0e-2)
-        rtol = max(tol, rtol)  # % of expected value as tolerance
+        dtol = tol if shots is None else max(tol, 1.0e-2)
+        # TODO Set better atol and rtol
         for r, e in zip(result, expected):
             if isinstance(shots, tuple) and isinstance(r[0], np.ndarray):
                 r = np.concatenate(r)
                 e = np.concatenate(e)
             # allclose -> absolute(r - e) <= (atol + rtol * absolute(e))
-            assert np.allclose(r, e, atol=atol, rtol=rtol)
+            assert np.allclose(r, e, atol=dtol, rtol=dtol)
 
     @pytest.mark.skipif(
         device_name == "lightning.gpu",

From 0399f18802c97ceaf2820bf2f091f8d20da75805 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 15:44:43 +0000
Subject: [PATCH 14/41] add simulate method

---
 .../lightning_gpu/lightning_gpu.py            | 34 +++++++++++++++++--
 tests/lightning_qubit/test_simulate_method.py |  3 --
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index edf95455ea..c60cf2f711 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -28,6 +28,7 @@
 from pennylane.devices import DefaultExecutionConfig, ExecutionConfig
 from pennylane.devices.modifiers import simulator_tracking, single_tape_support
 from pennylane.operation import Operator
+from pennylane.measurements import MidMeasureMP
 from pennylane.tape import QuantumScript, QuantumTape
 from pennylane.transforms.core import TransformProgram
 from pennylane.typing import Result
@@ -303,11 +304,14 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # Creating the state vector
         self._mpi_handler = LightningGPU_MPIHandler(
-            mpi, mpi_buf_size, self._dp, self.num_wires, c_dtype
+            mpi, mpi_buf_size, self._dp, len(self.wires), c_dtype
         )
 
         self._statevector = self.LightningStateVector(
-            self.num_wires, dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
+            num_wires=len(self.wires), 
+            dtype=c_dtype, 
+            mpi_handler=self._mpi_handler, 
+            sync=self._sync
         )
 
     @property
@@ -402,4 +406,28 @@ def simulate(
 
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
-        return 0
+        if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
+            results = []
+            aux_circ = qml.tape.QuantumScript(
+                circuit.operations,
+                circuit.measurements,
+                shots=[1],
+                trainable_params=circuit.trainable_params,
+            )
+            for _ in range(circuit.shots.total_shots):
+                state.reset_state()
+                mid_measurements = {}
+                final_state = state.get_final_state(
+                    aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode
+                )
+                results.append(
+                    LightningGPUMeasurements(final_state).measure_final_state(
+                        aux_circ, mid_measurements=mid_measurements
+                    )
+                )
+            return tuple(results)
+
+        state.reset_state()
+        final_state = state.get_final_state(circuit)
+        return LightningGPUMeasurements(final_state).measure_final_state(circuit)
+
diff --git a/tests/lightning_qubit/test_simulate_method.py b/tests/lightning_qubit/test_simulate_method.py
index 9dfecb64d3..ff5536846d 100644
--- a/tests/lightning_qubit/test_simulate_method.py
+++ b/tests/lightning_qubit/test_simulate_method.py
@@ -28,9 +28,6 @@
         allow_module_level=True,
     )
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
-
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 

From 23d56961d50bf2a64ff7fa41717dc501931e7b1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 15:51:49 +0000
Subject: [PATCH 15/41] apply format

---
 pennylane_lightning/lightning_gpu/lightning_gpu.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index c60cf2f711..c587648ef5 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -27,8 +27,8 @@
 import pennylane as qml
 from pennylane.devices import DefaultExecutionConfig, ExecutionConfig
 from pennylane.devices.modifiers import simulator_tracking, single_tape_support
-from pennylane.operation import Operator
 from pennylane.measurements import MidMeasureMP
+from pennylane.operation import Operator
 from pennylane.tape import QuantumScript, QuantumTape
 from pennylane.transforms.core import TransformProgram
 from pennylane.typing import Result
@@ -308,10 +308,7 @@ def __init__(  # pylint: disable=too-many-arguments
         )
 
         self._statevector = self.LightningStateVector(
-            num_wires=len(self.wires), 
-            dtype=c_dtype, 
-            mpi_handler=self._mpi_handler, 
-            sync=self._sync
+            num_wires=len(self.wires), dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
         )
 
     @property
@@ -430,4 +427,3 @@ def simulate(
         state.reset_state()
         final_state = state.get_final_state(circuit)
         return LightningGPUMeasurements(final_state).measure_final_state(circuit)
-

From 585c313361812da548e85cb68838961bc94df416 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 17:11:24 -0400
Subject: [PATCH 16/41] Shuli suggestion

---
 pennylane_lightning/lightning_gpu/_measurements.py | 11 ++++++++---
 pennylane_lightning/lightning_gpu/_mpi_handler.py  |  2 +-
 pennylane_lightning/lightning_gpu/_state_vector.py |  9 ++++++---
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 82ccb9e30f..d3c93a7946 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -15,6 +15,7 @@
 Class implementation for state vector measurements.
 """
 
+from warnings import warn
 try:
     from pennylane_lightning.lightning_gpu_ops import MeasurementsC64, MeasurementsC128
 
@@ -22,10 +23,14 @@
         from pennylane_lightning.lightning_gpu_ops import MeasurementsMPIC64, MeasurementsMPIC128
 
         MPI_SUPPORT = True
-    except ImportError:
+    except ImportError as ex:
+        warn(str(ex), UserWarning)
+
         MPI_SUPPORT = False
 
-except ImportError:
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
     pass
 
 from typing import List
@@ -57,7 +62,7 @@ def __init__(
         self._measurement_lightning = self._measurement_dtype()(lgpu_state.state_vector)
 
     def _measurement_dtype(self):
-        """Binding to Lightning Kokkos Measurements C++ class.
+        """Binding to Lightning GPU Measurements C++ class.
 
         Returns: the Measurements class
         """
diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index 46c7e81cc8..7fe09ac5cd 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -1,4 +1,4 @@
-# Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+# Copyright 2022-2024 Xanadu Quantum Technologies Inc.
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 17c61e60f6..7958b694ad 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -14,6 +14,7 @@
 """
 Class implementation for lightning_gpu state-vector manipulation.
 """
+from warnings import warn
 try:
     from pennylane_lightning.lightning_gpu_ops import StateVectorC64, StateVectorC128
 
@@ -22,11 +23,13 @@
         from pennylane_lightning.lightning_gpu_ops import StateVectorMPIC64, StateVectorMPIC128
 
         MPI_SUPPORT = True
-    except ImportError:
+    except ImportError as ex:
+        warn(str(ex), UserWarning)
+
         MPI_SUPPORT = False
 
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 from itertools import product
 

From 5eee8eb8d42bd30ba0e44dc2bf00cb2642c4eac9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 17:13:25 -0400
Subject: [PATCH 17/41] apply format

---
 pennylane_lightning/lightning_gpu/_measurements.py | 1 +
 pennylane_lightning/lightning_gpu/_state_vector.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index d3c93a7946..0417837cc0 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -16,6 +16,7 @@
 """
 
 from warnings import warn
+
 try:
     from pennylane_lightning.lightning_gpu_ops import MeasurementsC64, MeasurementsC128
 
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 7958b694ad..3fd7c5195f 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -15,6 +15,7 @@
 Class implementation for lightning_gpu state-vector manipulation.
 """
 from warnings import warn
+
 try:
     from pennylane_lightning.lightning_gpu_ops import StateVectorC64, StateVectorC128
 

From be3c6f736e1f6bf86467ccebf27b031755b9c89a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 17:28:59 -0400
Subject: [PATCH 18/41] Develop Jacobian

---
 .../lightning_gpu/_adjoint_jacobian.py        | 89 ++++++++++++++++++-
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
index 822af1f916..7175c30f0f 100644
--- a/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
+++ b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
@@ -15,9 +15,35 @@
 Internal methods for adjoint Jacobian differentiation method.
 """
 
+from warnings import warn
+try:
+    from pennylane_lightning.lightning_gpu_ops.algorithms import (
+        AdjointJacobianC64,
+        AdjointJacobianC128,
+        create_ops_listC64,
+        create_ops_listC128,
+    )
+
+    try:
+        from pennylane_lightning.lightning_gpu_ops.algorithmsMPI import (
+            AdjointJacobianMPIC64,
+            AdjointJacobianMPIC128,
+            create_ops_listMPIC64,
+            create_ops_listMPIC128,
+        )
+
+        MPI_SUPPORT = True
+    except ImportError as ex:
+        warn(str(ex), UserWarning)
+        MPI_SUPPORT = False
+
+except (ImportError, ValueError) as ex:
+    warn(str(ex), UserWarning)
+
 import numpy as np
-import pennylane as qml
+from pennylane.tape import QuantumTape
 
+# pylint: disable=ungrouped-imports
 from pennylane_lightning.core._adjoint_jacobian_base import LightningBaseAdjointJacobian
 
 from ._state_vector import LightningGPUStateVector
@@ -31,5 +57,62 @@ class LightningGPUAdjointJacobian(LightningBaseAdjointJacobian):
         batch_obs(bool): If serialized tape is to be batched or not.
     """
 
-    def __init__(self, lgpu_state: LightningGPUStateVector, batch_obs: bool = False) -> None:
-        super().__init__(lgpu_state, batch_obs)
+    # pylint: disable=too-few-public-methods
+
+    def __init__(self, qubit_state: LightningGPUStateVector, batch_obs: bool = False) -> None:
+        super().__init__(qubit_state, batch_obs)
+        # Initialize the C++ binds
+        self._jacobian_lightning, self._create_ops_list_lightning = self._adjoint_jacobian_dtype()
+
+    def _adjoint_jacobian_dtype(self):
+        """Binding to Lightning GPU Adjoint Jacobian C++ class.
+
+        Returns: the AdjointJacobian class
+        """
+        jacobian_lightning = (
+            AdjointJacobianC64() if self.dtype == np.complex64 else AdjointJacobianC128()
+        )
+        create_ops_list_lightning = (
+            create_ops_listC64 if self.dtype == np.complex64 else create_ops_listC128
+        )
+        return jacobian_lightning, create_ops_list_lightning
+
+    def calculate_jacobian(self, tape: QuantumTape):
+        """Computes the Jacobian with the adjoint method.
+
+        .. code-block:: python
+
+            statevector = LightningGPUStateVector(num_wires=num_wires)
+            statevector = statevector.get_final_state(tape)
+            jacobian = LightningGPUAdjointJacobian(statevector).calculate_jacobian(tape)
+
+        Args:
+            tape (QuantumTape): Operations and measurements that represent instructions for execution on Lightning.
+
+        Returns:
+            The Jacobian of a tape.
+        """
+
+        empty_array = self._handle_raises(tape, is_jacobian=True)
+
+        if empty_array:
+            return np.array([], dtype=self.dtype)
+
+        processed_data = self._process_jacobian_tape(tape)
+
+        if not processed_data:  # training_params is empty
+            return np.array([], dtype=self.dtype)
+
+        trainable_params = processed_data["tp_shift"]
+        jac = self._jacobian_lightning(
+            processed_data["state_vector"],
+            processed_data["obs_serialized"],
+            processed_data["ops_serialized"],
+            trainable_params,
+        )
+        jac = np.array(jac)
+        jac = jac.reshape(-1, len(trainable_params)) if len(jac) else jac
+        jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))
+        jac_r[:, processed_data["record_tp_rows"]] = jac
+
+        return self._adjoint_jacobian_processing(jac_r)

From 7ff13b33deaf6a662b7820dcc5e29407bfe9c049 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 21:47:44 +0000
Subject: [PATCH 19/41] unlock the test for jacobian and adjoint-jacobian

---
 pennylane_lightning/lightning_gpu/lightning_gpu.py   | 5 ++---
 tests/conftest.py                                    | 6 ++++--
 tests/lightning_qubit/test_adjoint_jacobian_class.py | 3 ---
 tests/lightning_qubit/test_jacobian_method.py        | 3 ---
 4 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index c587648ef5..a9db079511 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -55,9 +55,6 @@
     LGPU_CPP_BINARY_AVAILABLE = True
 
     try:
-        # pylint: disable=no-name-in-module
-        from pennylane_lightning.lightning_gpu_ops import DevTag, MPIManager
-
         from ._mpi_handler import LightningGPU_MPIHandler
 
         MPI_SUPPORT = True
@@ -320,6 +317,8 @@ def _set_lightning_classes(self):
         """Load the LightningStateVector, LightningMeasurements, LightningAdjointJacobian as class attribute"""
         self.LightningStateVector = LightningGPUStateVector
         self.LightningMeasurements = LightningGPUMeasurements
+        self.LightningAdjointJacobian = LightningGPUAdjointJacobian
+        
 
     def _setup_execution_config(self, config):
         """
diff --git a/tests/conftest.py b/tests/conftest.py
index ace8debfd9..1361ddfbc8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -142,6 +142,10 @@ def get_device():
         from pennylane_lightning.lightning_kokkos_ops import LightningException
 elif device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
+    from pennylane_lightning.lightning_gpu._adjoint_jacobian import (
+        LightningGPUAdjointJacobian as LightningAdjointJacobian,
+    )
+
     from pennylane_lightning.lightning_gpu._measurements import (
         LightningGPUMeasurements as LightningMeasurements,
     )
@@ -149,8 +153,6 @@ def get_device():
         LightningGPUStateVector as LightningStateVector,
     )
 
-    LightningAdjointJacobian = None
-
     if hasattr(pennylane_lightning, "lightning_gpu_ops"):
         import pennylane_lightning.lightning_gpu_ops as lightning_ops
         from pennylane_lightning.lightning_gpu_ops import LightningException
diff --git a/tests/lightning_qubit/test_adjoint_jacobian_class.py b/tests/lightning_qubit/test_adjoint_jacobian_class.py
index 199f2e4a8b..236c697f1d 100644
--- a/tests/lightning_qubit/test_adjoint_jacobian_class.py
+++ b/tests/lightning_qubit/test_adjoint_jacobian_class.py
@@ -34,9 +34,6 @@
         allow_module_level=True,
     )
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
-
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 
diff --git a/tests/lightning_qubit/test_jacobian_method.py b/tests/lightning_qubit/test_jacobian_method.py
index d4439ca2b1..745feee502 100644
--- a/tests/lightning_qubit/test_jacobian_method.py
+++ b/tests/lightning_qubit/test_jacobian_method.py
@@ -26,9 +26,6 @@
         allow_module_level=True,
     )
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
-
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 

From 3cec8b1584dc92cec1743b41d6eec62382f94cd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Wed, 11 Sep 2024 21:52:56 +0000
Subject: [PATCH 20/41] apply format

---
 pennylane_lightning/lightning_gpu/_adjoint_jacobian.py | 1 +
 pennylane_lightning/lightning_gpu/lightning_gpu.py     | 1 -
 tests/conftest.py                                      | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
index 7175c30f0f..4130c1082b 100644
--- a/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
+++ b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
@@ -16,6 +16,7 @@
 """
 
 from warnings import warn
+
 try:
     from pennylane_lightning.lightning_gpu_ops.algorithms import (
         AdjointJacobianC64,
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index a9db079511..533e298a8a 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -318,7 +318,6 @@ def _set_lightning_classes(self):
         self.LightningStateVector = LightningGPUStateVector
         self.LightningMeasurements = LightningGPUMeasurements
         self.LightningAdjointJacobian = LightningGPUAdjointJacobian
-        
 
     def _setup_execution_config(self, config):
         """
diff --git a/tests/conftest.py b/tests/conftest.py
index 1361ddfbc8..9c4e13c39b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -145,7 +145,6 @@ def get_device():
     from pennylane_lightning.lightning_gpu._adjoint_jacobian import (
         LightningGPUAdjointJacobian as LightningAdjointJacobian,
     )
-
     from pennylane_lightning.lightning_gpu._measurements import (
         LightningGPUMeasurements as LightningMeasurements,
     )

From 92089eb2172cee570c205aa82c2507255880b0c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Thu, 12 Sep 2024 11:11:19 -0400
Subject: [PATCH 21/41] Update
 pennylane_lightning/lightning_gpu/_mpi_handler.py

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
---
 pennylane_lightning/lightning_gpu/_mpi_handler.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index 7fe09ac5cd..de9bbbd950 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -42,7 +42,6 @@ class LightningGPU_MPIHandler:
         dev_pool (Callable): Method to handle the GPU devices available.
         num_wires (int): the number of wires to initialize the device wit.h
         c_dtype (np.complex64, np.complex128): Datatypes for statevector representation
-
     """
 
     def __init__(

From 196042a979ea48f124b53c4a7bcbafb5dd0eb649 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Thu, 12 Sep 2024 11:13:55 -0400
Subject: [PATCH 22/41] Apply suggestions from code review Vinvent's comments

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
---
 pennylane_lightning/lightning_gpu/_mpi_handler.py  | 4 ++--
 pennylane_lightning/lightning_gpu/_state_vector.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index de9bbbd950..23656ee518 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -40,7 +40,7 @@ class LightningGPU_MPIHandler:
         mpi (bool): declare if the device will use the MPI support.
         mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
         dev_pool (Callable): Method to handle the GPU devices available.
-        num_wires (int): the number of wires to initialize the device wit.h
+        num_wires (int): the number of wires to initialize the device with.
         c_dtype (np.complex64, np.complex128): Datatypes for statevector representation
     """
 
@@ -70,7 +70,7 @@ def __init__(
                     f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2."
                 )
 
-            # After check if all MPI parameter are ok
+            # After check if all MPI parameters are ok
             self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
 
             # set the number of global and local wires
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 3fd7c5195f..ff960cf93e 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -123,9 +123,9 @@ def reset_state(self):
         self._qubit_state.resetGPU(False)  # Sync reset
 
     def syncD2H(self, state_vector, use_async=False):
-        """Copy the state vector data on device to a state vector on the host provided by the user
+        """Copy the state vector data on device to a state vector on the host provided by the user.
         Args:
-            state_vector(array[complex]): the state vector array on host
+            state_vector(array[complex]): the state vector array on host.
             use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
             Note: This function only supports synchronized memory copy.
 

From b4ed1ae3cd6fa0ed3eb626b3b9351e1b75f93582 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Thu, 12 Sep 2024 16:00:07 +0000
Subject: [PATCH 23/41] Vincent's comments

---
 .../lightning_gpu/_state_vector.py            |  7 ++----
 .../lightning_gpu/lightning_gpu.py            | 22 ++-----------------
 2 files changed, 4 insertions(+), 25 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 3fd7c5195f..942714f8f2 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -190,7 +190,7 @@ def _asarray(arr, dtype=None):
         return arr
 
     def _create_basis_state(self, index, use_async=False):
-        """Return a computational basis state over all wires.
+        """Creates a computational basis state consisting of 0s and 1s, over all wires on device.
         Args:
             index (int): integer representing the computational basis state.
             use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
@@ -255,9 +255,6 @@ def _apply_basis_state(self, state, wires):
             wires (Wires): wires that the provided computational state should be initialized on
         Note: This function does not support broadcasted inputs yet.
         """
-        # translate to wire labels used by device
-        device_wires = wires
-
         if not set(state.tolist()).issubset({0, 1}):
             raise ValueError("BasisState parameter must consist of 0 or 1 integers.")
 
@@ -265,7 +262,7 @@ def _apply_basis_state(self, state, wires):
             raise ValueError("BasisState parameter and wires must be of equal length.")
 
         # get computational basis state number
-        basis_states = 1 << (self.num_wires - 1 - np.array(device_wires))
+        basis_states = 1 << (self.num_wires - 1 - np.array(wires))
         basis_states = qml.math.convert_like(basis_states, state)
         num = int(qml.math.dot(state, basis_states))
 
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index c587648ef5..41fd1d097f 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -404,26 +404,8 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
-            results = []
-            aux_circ = qml.tape.QuantumScript(
-                circuit.operations,
-                circuit.measurements,
-                shots=[1],
-                trainable_params=circuit.trainable_params,
-            )
-            for _ in range(circuit.shots.total_shots):
-                state.reset_state()
-                mid_measurements = {}
-                final_state = state.get_final_state(
-                    aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode
-                )
-                results.append(
-                    LightningGPUMeasurements(final_state).measure_final_state(
-                        aux_circ, mid_measurements=mid_measurements
-                    )
-                )
-            return tuple(results)
-
+            raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
+            
         state.reset_state()
         final_state = state.get_final_state(circuit)
         return LightningGPUMeasurements(final_state).measure_final_state(circuit)

From 1729d0665f6011257d14e3f48d58441c4aedb947 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Thu, 12 Sep 2024 16:04:45 +0000
Subject: [PATCH 24/41] apply format

---
 pennylane_lightning/lightning_gpu/lightning_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 41fd1d097f..0977bfee28 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -405,7 +405,7 @@ def simulate(
         """
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
             raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
-            
+
         state.reset_state()
         final_state = state.get_final_state(circuit)
         return LightningGPUMeasurements(final_state).measure_final_state(circuit)

From 5819efc2374a862ec687a4f4f8c4fb39f41c8521 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Fri, 13 Sep 2024 11:27:54 -0400
Subject: [PATCH 25/41] Apply suggestions from code review. Vincent's
 suggestion

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
---
 pennylane_lightning/lightning_gpu/_mpi_handler.py     | 2 +-
 pennylane_lightning/lightning_gpu/lightning_gpu.py    | 4 ++--
 pennylane_lightning/lightning_kokkos/_measurements.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index 23656ee518..9b07f8eec5 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -41,7 +41,7 @@ class LightningGPU_MPIHandler:
         mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
         dev_pool (Callable): Method to handle the GPU devices available.
         num_wires (int): the number of wires to initialize the device with.
-        c_dtype (np.complex64, np.complex128): Datatypes for statevector representation
+        c_dtype (np.complex64, np.complex128): Datatypes for statevector representation.
     """
 
     def __init__(
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 0977bfee28..aa3b418428 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -214,7 +214,7 @@ def check_gpu_resources() -> None:
     if find_library("custatevec") is None and not imp_util.find_spec("cuquantum"):
 
         raise ImportError(
-            "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
+            "cuStateVec libraries not found. Please pip install the appropriate cuStateVec library in a virtual environment."
         )
 
     if not DevPool.getTotalDevices():
@@ -247,7 +247,7 @@ class LightningGPU(LightningBase):
             is built with MPI. Default is False.
         mpi (bool): declare if the device will use the MPI support.
         mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
-        sync (bool): immediately sync with host-sv after applying operation
+        sync (bool): immediately sync with host-sv after applying operation.
     """
 
     # General device options
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index 86c59595b7..866ac3214f 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -120,7 +120,7 @@ def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
         Args:
-            measurementprocess (StateMeasurement): measurement to apply to the state
+            measurementprocess (StateMeasurement): measurement to apply to the state.
 
         Returns:
             Probabilities of the supplied observable or wires

From 2dbc7dbc3b645f4b7bd639f70779e01ea533cb90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Fri, 13 Sep 2024 15:31:37 +0000
Subject: [PATCH 26/41] review comments

---
 .../lightning_gpu/_mpi_handler.py             | 19 +++++++++++--------
 .../lightning_gpu/_state_vector.py            | 18 +++++-------------
 2 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index 9b07f8eec5..245081d64d 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -30,7 +30,7 @@
 
 # MPI options
 class LightningGPU_MPIHandler:
-    """MPI handler for PennyLane Lightning GPU device
+    """MPI handler for PennyLane Lightning GPU device.
 
     MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
 
@@ -76,19 +76,22 @@ def __init__(
             # set the number of global and local wires
             commSize = self._mpi_manager.getSize()
             self.num_global_wires = commSize.bit_length() - 1
-            self.num_local_wires = num_wires - self._num_global_wires
+            self.num_local_wires = num_wires - self.num_global_wires
 
-            # Memory size in bytes
-            sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
-            if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
-                raise ValueError("The MPI buffer size is larger than the local state vector size.")
+            self._check_memory_size(c_dtype, mpi_buf_size)
 
         if not self.use_mpi:
             self.num_local_wires = num_wires
             self.num_global_wires = num_wires
 
-    def _mebibytesToBytes(mebibytes):
+    def _mebibytesToBytes(self,mebibytes):
         return mebibytes * 1024 * 1024
+    
+    def _check_memory_size(self,c_dtype, mpi_buf_size):
+        # Memory size in bytes
+        sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
+        if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
+            raise ValueError("The MPI buffer size is larger than the local state vector size.")
 
     def _mpi_init_helper(self, num_wires):
         """Set up MPI checks and initializations."""
@@ -112,7 +115,7 @@ def _mpi_init_helper(self, num_wires):
             )
 
         # set GPU device
-        rank = self._mpi_manager.getRank()
+        rank = mpi_manager.getRank()
         deviceid = rank % numProcsNode
         self._dp.setDeviceID(deviceid)
         devtag = DevTag(deviceid)
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 106ae80b48..4c591b24c4 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -68,6 +68,8 @@ class LightningGPUStateVector(LightningBaseStateVector):
         dtype: Datatypes for state-vector representation. Must be one of
             ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
         device_name(string): state vector device name. Options: ["lightning.gpu"]
+        mpi_handler(LightningGPU_MPIHandler): MPI handler for PennyLane Lightning GPU device. Provides functionality to run on multiple devices. 
+        sync (bool): immediately sync with host-sv after applying operation.
     """
 
     def __init__(
@@ -93,7 +95,7 @@ def __init__(
         self._sync = sync
 
         # Initialize the state vector
-        if self._mpi_handler.use_mpi:
+        if self._mpi_handler.use_mpi: # using MPI
             self._qubit_state = self._state_dtype()(
                 self._mpi_handler.mpi_manager,
                 self._mpi_handler.devtag,
@@ -101,8 +103,7 @@ def __init__(
                 self._mpi_handler.num_global_wires,
                 self._mpi_handler.num_local_wires,
             )
-
-        if not self._mpi_handler.use_mpi:
+        else: # without MPI
             self._qubit_state = self._state_dtype()(self.num_wires)
 
         self._create_basis_state(0)
@@ -227,8 +228,6 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
                 return
             local_state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
             self._mpi_handler.mpi_manager.Scatter(state, local_state, 0)
-            # Initialize the entire device state with the input state
-            # self.syncH2D(self._reshape(local_state, output_shape))
             self.syncH2D(np.reshape(local_state, output_shape))
             return
 
@@ -337,14 +336,7 @@ def _apply_lightning(
             method = getattr(state, name, None)
             wires = list(operation.wires)
 
-            if isinstance(operation, Conditional):
-                if operation.meas_val.concretize(mid_measurements):
-                    self._apply_lightning([operation.base])
-            elif isinstance(operation, MidMeasureMP):
-                self._apply_lightning_midmeasure(
-                    operation, mid_measurements, postselect_mode=postselect_mode
-                )
-            elif method is not None:  # apply specialized gate
+            if method is not None:  # apply specialized gate
                 param = operation.parameters
                 method(wires, invert_param, param)
             elif isinstance(operation, qml.ops.Controlled) and isinstance(

From 0630edfeed7fdca4884d17a1463e0c5007488ab7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Fri, 13 Sep 2024 15:40:07 +0000
Subject: [PATCH 27/41] apply format

---
 pennylane_lightning/lightning_gpu/_mpi_handler.py  | 6 +++---
 pennylane_lightning/lightning_gpu/_state_vector.py | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index 245081d64d..ab7cccc689 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -84,10 +84,10 @@ def __init__(
             self.num_local_wires = num_wires
             self.num_global_wires = num_wires
 
-    def _mebibytesToBytes(self,mebibytes):
+    def _mebibytesToBytes(self, mebibytes):
         return mebibytes * 1024 * 1024
-    
-    def _check_memory_size(self,c_dtype, mpi_buf_size):
+
+    def _check_memory_size(self, c_dtype, mpi_buf_size):
         # Memory size in bytes
         sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
         if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 4c591b24c4..cac6f24f8e 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -68,7 +68,8 @@ class LightningGPUStateVector(LightningBaseStateVector):
         dtype: Datatypes for state-vector representation. Must be one of
             ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
         device_name(string): state vector device name. Options: ["lightning.gpu"]
-        mpi_handler(LightningGPU_MPIHandler): MPI handler for PennyLane Lightning GPU device. Provides functionality to run on multiple devices. 
+        mpi_handler(LightningGPU_MPIHandler): MPI handler for PennyLane Lightning GPU device. 
+            Provides functionality to run on multiple devices.
         sync (bool): immediately sync with host-sv after applying operation.
     """
 
@@ -95,7 +96,7 @@ def __init__(
         self._sync = sync
 
         # Initialize the state vector
-        if self._mpi_handler.use_mpi: # using MPI
+        if self._mpi_handler.use_mpi:  # using MPI
             self._qubit_state = self._state_dtype()(
                 self._mpi_handler.mpi_manager,
                 self._mpi_handler.devtag,
@@ -103,7 +104,7 @@ def __init__(
                 self._mpi_handler.num_global_wires,
                 self._mpi_handler.num_local_wires,
             )
-        else: # without MPI
+        else:  # without MPI
             self._qubit_state = self._state_dtype()(self.num_wires)
 
         self._create_basis_state(0)

From 3fa840921bfe28fb9accd888906272a498888280 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Fri, 13 Sep 2024 15:52:30 +0000
Subject: [PATCH 28/41] apply format

---
 pennylane_lightning/lightning_gpu/_state_vector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index cac6f24f8e..45dc3768ee 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -68,7 +68,7 @@ class LightningGPUStateVector(LightningBaseStateVector):
         dtype: Datatypes for state-vector representation. Must be one of
             ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
         device_name(string): state vector device name. Options: ["lightning.gpu"]
-        mpi_handler(LightningGPU_MPIHandler): MPI handler for PennyLane Lightning GPU device. 
+        mpi_handler(LightningGPU_MPIHandler): MPI handler for PennyLane Lightning GPU device.
             Provides functionality to run on multiple devices.
         sync (bool): immediately sync with host-sv after applying operation.
     """

From af16b8d4304c38a066bd3493fe1ba2293cc79abf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Fri, 13 Sep 2024 22:12:49 +0000
Subject: [PATCH 29/41] Ali suggestion 1

---
 .../core/_measurements_base.py                | 20 +++++++++-
 .../lightning_gpu/bindings/LGPUBindings.hpp   |  2 +-
 .../bindings/LGPUBindingsMPI.hpp              |  2 +-
 .../lightning_gpu/_measurements.py            | 39 +++++++------------
 .../lightning_gpu/_state_vector.py            |  5 ++-
 .../lightning_kokkos/_measurements.py         | 32 +++++----------
 .../lightning_qubit/_measurements.py          | 32 +++++----------
 7 files changed, 58 insertions(+), 74 deletions(-)

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index 51538cd5ec..4e463cb83d 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -131,15 +131,31 @@ def expval(self, measurementprocess: MeasurementProcess):
         )
 
     @abstractmethod
+    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
+        """Convert the data structure from the C++ backend to a common structure through lightning devices."""
+
     def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
         Args:
-            measurementprocess (StateMeasurement): measurement to apply to the state
+            measurementprocess (StateMeasurement): measurement to apply to the state.
 
         Returns:
-            Probabilities of the supplied observable or wires
+            Probabilities of the supplied observable or wires.
         """
+        diagonalizing_gates = measurementprocess.diagonalizing_gates()
+
+        if diagonalizing_gates:
+            self._qubit_state.apply_operations(diagonalizing_gates)
+
+        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+
+        if diagonalizing_gates:
+            self._qubit_state.apply_operations(
+                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
+            )
+
+        return self._probs_retval_conversion(results) 
 
     def var(self, measurementprocess: MeasurementProcess):
         """Variance of the supplied observable contained in the MeasurementProcess.
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index 5bd92b5520..a1ae68e8a8 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -152,7 +152,7 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
              "Get the GPU index for the statevector data.")
         .def("numQubits", &StateVectorT::getNumQubits)
         .def("dataLength", &StateVectorT::getLength)
-        .def("resetGPU", &StateVectorT::initSV)
+        .def("resetStateVector", &StateVectorT::initSV)
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 620fd93868..e9f8b762d3 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -155,7 +155,7 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
              "Get the GPU index for the statevector data.")
         .def("numQubits", &StateVectorT::getNumQubits)
         .def("dataLength", &StateVectorT::getLength)
-        .def("resetGPU", &StateVectorT::initSV)
+        .def("resetStateVector", &StateVectorT::initSV)
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 0417837cc0..206fd8a630 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -34,7 +34,7 @@
 
     pass
 
-from typing import List
+from typing import Any, List
 
 import numpy as np
 import pennylane as qml
@@ -126,31 +126,20 @@ def _process_single_shot(samples):
         return (
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
-
-    def probs(self, measurementprocess: MeasurementProcess):
-        """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
-
-        Args:
-            measurementprocess (StateMeasurement): measurement to apply to the state
-
-        Returns:
-            Probabilities of the supplied observable or wires
+        
+    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
+        """Convert the data structure from the C++ backend to a common structure through lightning devices.
+        
+        Args: 
+            probs_result (Any): Result provided by C++ backend.
+            
+        Returns: 
+            np.ndarray with probabilities of the supplied observable or wires.
         """
-        diagonalizing_gates = measurementprocess.diagonalizing_gates()
-
-        if diagonalizing_gates:
-            self._qubit_state.apply_operations(diagonalizing_gates)
-
-        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
-
-        if diagonalizing_gates:
-            self._qubit_state.apply_operations(
-                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
-            )
 
         # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
-        if len(results) > 0:
-            num_local_wires = len(results).bit_length() - 1 if len(results) > 0 else 0
-            return results.reshape([2] * num_local_wires).transpose().reshape(-1)
+        if len(probs_results) > 0:
+            num_local_wires = len(probs_results).bit_length() - 1 if len(probs_results) > 0 else 0
+            return probs_results.reshape([2] * num_local_wires).transpose().reshape(-1)
 
-        return results
+        return probs_results
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 45dc3768ee..5c45745209 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -122,7 +122,7 @@ def _state_dtype(self):
     def reset_state(self):
         """Reset the device's state"""
         # init the state vector to |00..0>
-        self._qubit_state.resetGPU(False)  # Sync reset
+        self._qubit_state.resetStateVector(False)  # Sync reset
 
     def syncD2H(self, state_vector, use_async=False):
         """Copy the state vector data on device to a state vector on the host provided by the user.
@@ -215,6 +215,9 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
             raise DeviceError("LightningGPU does not support allocate external state_vector.")
 
             # TODO
+            # Create an implementation in the C++ backend and binding to be able 
+            # to allocate memory for a new statevector and copy the data 
+            # from an external state vector.
             # state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
             # state.getState(state_data)
             # state = state_data
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index 866ac3214f..195944b2a4 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -21,7 +21,7 @@
 except ImportError:
     pass
 
-from typing import List
+from typing import Any, List
 
 import numpy as np
 import pennylane as qml
@@ -116,25 +116,13 @@ def _process_single_shot(samples):
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
 
-    def probs(self, measurementprocess: MeasurementProcess):
-        """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
-
-        Args:
-            measurementprocess (StateMeasurement): measurement to apply to the state.
-
-        Returns:
-            Probabilities of the supplied observable or wires
+    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
+        """Convert the data structure from the C++ backend to a common structure through lightning devices.
+        
+        Args: 
+            probs_result (Any): Result provided by C++ backend.
+            
+        Returns: 
+            np.ndarray with probabilities of the supplied observable or wires.
         """
-        diagonalizing_gates = measurementprocess.diagonalizing_gates()
-
-        if diagonalizing_gates:
-            self._qubit_state.apply_operations(diagonalizing_gates)
-
-        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
-
-        if diagonalizing_gates:
-            self._qubit_state.apply_operations(
-                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
-            )
-
-        return results
+        return probs_results
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index 6958f5b1e5..d712040a91 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -22,7 +22,7 @@
     pass
 
 from functools import reduce
-from typing import List
+from typing import Any, List
 
 import numpy as np
 import pennylane as qml
@@ -142,25 +142,13 @@ def _process_single_shot(samples):
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
 
-    def probs(self, measurementprocess: MeasurementProcess):
-        """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
-
-        Args:
-            measurementprocess (StateMeasurement): measurement to apply to the state
-
-        Returns:
-            Probabilities of the supplied observable or wires
+    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
+        """Convert the data structure from the C++ backend to a common structure through lightning devices.
+        
+        Args: 
+            probs_result (Any): Result provided by C++ backend.
+            
+        Returns: 
+            np.ndarray with probabilities of the supplied observable or wires.
         """
-        diagonalizing_gates = measurementprocess.diagonalizing_gates()
-
-        if diagonalizing_gates:
-            self._qubit_state.apply_operations(diagonalizing_gates)
-
-        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
-
-        if diagonalizing_gates:
-            self._qubit_state.apply_operations(
-                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
-            )
-
-        return results
+        return probs_results

From 0cb050fd3b39e70f2086ebf7cf8839923e2577b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 16:06:46 +0000
Subject: [PATCH 30/41] add reset

---
 pennylane_lightning/core/_state_vector_base.py     | 7 +++++--
 pennylane_lightning/lightning_gpu/lightning_gpu.py | 2 +-
 tests/lightning_qubit/test_state_vector_class.py   | 5 ++++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index b2ba3a0669..31589357a2 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -96,10 +96,13 @@ def _state_dtype(self):
         Returns: the state vector class
         """
 
-    def reset_state(self):
+    def reset_state(self, sync=None):
         """Reset the device's state"""
         # init the state vector to |00..0>
-        self._qubit_state.resetStateVector()
+        if sync == None:
+            self._qubit_state.resetStateVector()
+        else:
+            self._qubit_state.resetStateVector(sync)
 
     @abstractmethod
     def _apply_state_vector(self, state, device_wires: Wires):
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index aa3b418428..07a74733a3 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -406,6 +406,6 @@ def simulate(
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
             raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
 
-        state.reset_state()
+        state.reset_state(sync=False)
         final_state = state.get_final_state(circuit)
         return LightningGPUMeasurements(final_state).measure_final_state(circuit)
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index b1bcdf1de1..78df38c2a0 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -155,7 +155,10 @@ def test_reset_state(tol, operation, par):
     state_vector = LightningStateVector(wires)
     state_vector.apply_operations([operation(np.array(par), Wires(range(wires)))])
 
-    state_vector.reset_state()
+    if device_name == 'lightning.gpu':
+        state_vector.reset_state(sync=False)
+    else:
+        state_vector.reset_state()
 
     expected_output = np.array([1, 0, 0, 0], dtype=state_vector.dtype)
     assert np.allclose(state_vector.state, expected_output, atol=tol, rtol=0)

From 54afeb549085fc59d50df32627381b6ba6813d8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 19:26:32 +0000
Subject: [PATCH 31/41] apply_basis_state as abstract in GPU

---
 .../core/_state_vector_base.py                | 21 +++++++----
 .../lightning_gpu/StateVectorCudaManaged.hpp  | 35 +++++++++++++++++++
 .../lightning_gpu/bindings/LGPUBindings.hpp   | 17 ++++++---
 .../lightning_gpu/_state_vector.py            | 30 ++--------------
 4 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index 31589357a2..6b73d0100d 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -16,7 +16,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Union
+from typing import Union, Optional
 
 import numpy as np
 from pennylane import BasisState, StatePrep
@@ -35,9 +35,10 @@ class LightningBaseStateVector(ABC):
         num_wires(int): the number of wires to initialize the device with
         dtype: Datatypes for state-vector representation. Must be one of
             ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
+        sync Optional(bool): immediately sync with host-sv after applying operation.
     """
 
-    def __init__(self, num_wires: int, dtype: Union[np.complex128, np.complex64]):
+    def __init__(self, num_wires: int, dtype: Union[np.complex128, np.complex64], sync: Optional[bool] = None):
 
         if dtype not in [np.complex64, np.complex128]:
             raise TypeError(f"Unsupported complex type: {dtype}")
@@ -45,6 +46,7 @@ def __init__(self, num_wires: int, dtype: Union[np.complex128, np.complex64]):
         self._num_wires = num_wires
         self._wires = Wires(range(num_wires))
         self._dtype = dtype
+        self._sync = sync
 
         # Dummy for the device name
         self._device_name = None
@@ -96,7 +98,7 @@ def _state_dtype(self):
         Returns: the state vector class
         """
 
-    def reset_state(self, sync=None):
+    def reset_state(self, sync:Optional[bool] =None):
         """Reset the device's state"""
         # init the state vector to |00..0>
         if sync == None:
@@ -105,7 +107,7 @@ def reset_state(self, sync=None):
             self._qubit_state.resetStateVector(sync)
 
     @abstractmethod
-    def _apply_state_vector(self, state, device_wires: Wires):
+    def _apply_state_vector(self, state, device_wires: Wires, sync: Optional[bool] = None):
         """Initialize the internal state vector in a specified state.
         Args:
             state (array[complex]): normalized input state of length ``2**len(wires)``
@@ -113,7 +115,7 @@ def _apply_state_vector(self, state, device_wires: Wires):
             device_wires (Wires): wires that get initialized in the state
         """
 
-    def _apply_basis_state(self, state, wires):
+    def _apply_basis_state(self, state, wires, use_async:Optional[bool] = None):
         """Initialize the state vector in a specified computational basis state.
 
         Args:
@@ -121,6 +123,7 @@ def _apply_basis_state(self, state, wires):
                 consisting of 0s and 1s.
             wires (Wires): wires that the provided computational state should be
                 initialized on
+            use_async(Optional[bool]): immediately sync with host-sv after applying operation.
 
         Note: This function does not support broadcasted inputs yet.
         """
@@ -131,7 +134,11 @@ def _apply_basis_state(self, state, wires):
             raise ValueError("BasisState parameter and wires must be of equal length.")
 
         # Return a computational basis state over all wires.
-        self._qubit_state.setBasisState(list(state), list(wires))
+        if use_async == None:
+            self._qubit_state.setBasisState(list(state), list(wires))
+        else:
+            self._qubit_state.setBasisState(list(state), list(wires), use_async)
+            
 
     @abstractmethod
     def _apply_lightning_controlled(self, operation):
@@ -188,7 +195,7 @@ def apply_operations(
                 self._apply_state_vector(operations[0].parameters[0].copy(), operations[0].wires)
                 operations = operations[1:]
             elif isinstance(operations[0], BasisState):
-                self._apply_basis_state(operations[0].parameters[0], operations[0].wires)
+                self._apply_basis_state(operations[0].parameters[0], operations[0].wires, self._sync)
                 operations = operations[1:]
         self._apply_lightning(
             operations, mid_measurements=mid_measurements, postselect_mode=postselect_mode
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 716d95c89f..ef05b4aa7c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -184,6 +184,41 @@ class StateVectorCudaManaged
                            stream_id);
     }
 
+    /**
+     * @brief Prepares a single computational basis state.
+     *
+     * @param state Binary number representing the index
+     * @param wires Wires.
+     * @param 
+     */
+    void setBasisState(const std::vector<std::size_t> &state,
+                       const std::vector<std::size_t> &wires,
+                       const bool use_async) {
+        PL_ABORT_IF_NOT(state.size() == wires.size(),
+                        "state and wires must have equal dimensions.");
+        const auto num_qubits = BaseType::getNumQubits();
+        PL_ABORT_IF_NOT(
+            std::find_if(wires.begin(), wires.end(),
+                         [&num_qubits](const auto i) {
+                             return i >= num_qubits;
+                         }) == wires.end(),
+            "wires must take values lower than the number of qubits.");
+        const auto n_wires = wires.size();
+        std::size_t index{0U};
+        for (std::size_t k = 0; k < n_wires; k++) {
+            const auto bit = static_cast<std::size_t>(state[k]);
+            index |= bit << (num_qubits - 1 - wires[k]);
+        }
+
+        BaseType::getDataBuffer().zeroInit();
+        const std::complex<PrecisionT> value(1, 0);
+        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+        setBasisState_CUDA(BaseType::getData(), value_cu, index, use_async,
+                           stream_id);
+    }
+
+
     /**
      * @brief Set values for a batch of elements of the state-vector. This
      * method is implemented by the customized CUDA kernel defined in the
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index a1ae68e8a8..6091794aee 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -81,14 +81,23 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
             return new StateVectorT(data_ptr,
                                     static_cast<std::size_t>(arr.size()));
         }))
+        .def(
+            "setBasisStateZero",
+            [](StateVectorT &sv, const bool use_async) {
+                const std::complex<PrecisionT> value(1, 0);
+                std::size_t zero{0U};
+                sv.setBasisState(value, zero, use_async);
+            },
+            "Create Basis State to zero on GPU.")
         .def(
             "setBasisState",
-            [](StateVectorT &sv, const std::size_t index,
+            [](StateVectorT &sv, const std::vector<std::size_t> &state,
+               const std::vector<std::size_t> &wires,
                const bool use_async) {
-                const std::complex<PrecisionT> value(1, 0);
-                sv.setBasisState(value, index, use_async);
+                sv.setBasisState(state, wires, use_async);
             },
-            "Create Basis State on GPU.")
+            "Set the state vector to a basis state on GPU.")
+
         .def(
             "setStateVector",
             [](StateVectorT &sv, const np_arr_sparse_ind &indices,
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 5c45745209..a2b6aa2085 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -82,7 +82,7 @@ def __init__(
         sync=True,
     ):
 
-        super().__init__(num_wires, dtype)
+        super().__init__(num_wires, dtype, sync=sync)
 
         self._device_name = device_name
 
@@ -107,7 +107,8 @@ def __init__(
         else:  # without MPI
             self._qubit_state = self._state_dtype()(self.num_wires)
 
-        self._create_basis_state(0)
+        use_async = False
+        self._qubit_state.setBasisStateZero(use_async)
 
     def _state_dtype(self):
         """Binding to Lightning Managed state vector C++ class.
@@ -119,10 +120,6 @@ def _state_dtype(self):
         else:
             return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64
 
-    def reset_state(self):
-        """Reset the device's state"""
-        # init the state vector to |00..0>
-        self._qubit_state.resetStateVector(False)  # Sync reset
 
     def syncD2H(self, state_vector, use_async=False):
         """Copy the state vector data on device to a state vector on the host provided by the user.
@@ -250,27 +247,6 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
             ravelled_indices, state, use_async
         )  # this operation on device
 
-    def _apply_basis_state(self, state, wires):
-        """Initialize the state vector in a specified computational basis state on GPU directly.
-            Args:
-            state (array[int]): computational basis state (on host) of shape ``(wires,)``
-                consisting of 0s and 1s.
-            wires (Wires): wires that the provided computational state should be initialized on
-        Note: This function does not support broadcasted inputs yet.
-        """
-        if not set(state.tolist()).issubset({0, 1}):
-            raise ValueError("BasisState parameter must consist of 0 or 1 integers.")
-
-        if len(state) != len(wires):
-            raise ValueError("BasisState parameter and wires must be of equal length.")
-
-        # get computational basis state number
-        basis_states = 1 << (self.num_wires - 1 - np.array(wires))
-        basis_states = qml.math.convert_like(basis_states, state)
-        num = int(qml.math.dot(state, basis_states))
-
-        self._create_basis_state(num)
-
     def _apply_lightning_controlled(self, operation):
         """Apply an arbitrary controlled operation to the state tensor.
 

From ac87663e020d93d36223110f65177c9876e07738 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 20:11:59 +0000
Subject: [PATCH 32/41] apply format

---
 pennylane_lightning/core/_measurements_base.py    |  2 +-
 pennylane_lightning/core/_state_vector_base.py    | 15 +++++++++------
 .../lightning_gpu/StateVectorCudaManaged.hpp      |  3 +--
 .../lightning_gpu/bindings/LGPUBindings.hpp       |  3 +--
 .../lightning_gpu/_measurements.py                | 10 +++++-----
 .../lightning_gpu/_state_vector.py                |  5 ++---
 .../lightning_kokkos/_measurements.py             |  8 ++++----
 .../lightning_qubit/_measurements.py              |  8 ++++----
 tests/lightning_qubit/test_state_vector_class.py  |  2 +-
 9 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index 4e463cb83d..c979878a9b 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -155,7 +155,7 @@ def probs(self, measurementprocess: MeasurementProcess):
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
 
-        return self._probs_retval_conversion(results) 
+        return self._probs_retval_conversion(results)
 
     def var(self, measurementprocess: MeasurementProcess):
         """Variance of the supplied observable contained in the MeasurementProcess.
diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index 6b73d0100d..b35d4d6e42 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -16,7 +16,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Union, Optional
+from typing import Optional, Union
 
 import numpy as np
 from pennylane import BasisState, StatePrep
@@ -38,7 +38,9 @@ class LightningBaseStateVector(ABC):
         sync Optional(bool): immediately sync with host-sv after applying operation.
     """
 
-    def __init__(self, num_wires: int, dtype: Union[np.complex128, np.complex64], sync: Optional[bool] = None):
+    def __init__(
+        self, num_wires: int, dtype: Union[np.complex128, np.complex64], sync: Optional[bool] = None
+    ):
 
         if dtype not in [np.complex64, np.complex128]:
             raise TypeError(f"Unsupported complex type: {dtype}")
@@ -98,7 +100,7 @@ def _state_dtype(self):
         Returns: the state vector class
         """
 
-    def reset_state(self, sync:Optional[bool] =None):
+    def reset_state(self, sync: Optional[bool] = None):
         """Reset the device's state"""
         # init the state vector to |00..0>
         if sync == None:
@@ -115,7 +117,7 @@ def _apply_state_vector(self, state, device_wires: Wires, sync: Optional[bool] =
             device_wires (Wires): wires that get initialized in the state
         """
 
-    def _apply_basis_state(self, state, wires, use_async:Optional[bool] = None):
+    def _apply_basis_state(self, state, wires, use_async: Optional[bool] = None):
         """Initialize the state vector in a specified computational basis state.
 
         Args:
@@ -138,7 +140,6 @@ def _apply_basis_state(self, state, wires, use_async:Optional[bool] = None):
             self._qubit_state.setBasisState(list(state), list(wires))
         else:
             self._qubit_state.setBasisState(list(state), list(wires), use_async)
-            
 
     @abstractmethod
     def _apply_lightning_controlled(self, operation):
@@ -195,7 +196,9 @@ def apply_operations(
                 self._apply_state_vector(operations[0].parameters[0].copy(), operations[0].wires)
                 operations = operations[1:]
             elif isinstance(operations[0], BasisState):
-                self._apply_basis_state(operations[0].parameters[0], operations[0].wires, self._sync)
+                self._apply_basis_state(
+                    operations[0].parameters[0], operations[0].wires, self._sync
+                )
                 operations = operations[1:]
         self._apply_lightning(
             operations, mid_measurements=mid_measurements, postselect_mode=postselect_mode
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index ef05b4aa7c..808a47f60f 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -189,7 +189,7 @@ class StateVectorCudaManaged
      *
      * @param state Binary number representing the index
      * @param wires Wires.
-     * @param 
+     * @param
      */
     void setBasisState(const std::vector<std::size_t> &state,
                        const std::vector<std::size_t> &wires,
@@ -218,7 +218,6 @@ class StateVectorCudaManaged
                            stream_id);
     }
 
-
     /**
      * @brief Set values for a batch of elements of the state-vector. This
      * method is implemented by the customized CUDA kernel defined in the
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index 6091794aee..3c44179702 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -92,8 +92,7 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
         .def(
             "setBasisState",
             [](StateVectorT &sv, const std::vector<std::size_t> &state,
-               const std::vector<std::size_t> &wires,
-               const bool use_async) {
+               const std::vector<std::size_t> &wires, const bool use_async) {
                 sv.setBasisState(state, wires, use_async);
             },
             "Set the state vector to a basis state on GPU.")
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 206fd8a630..961579ac84 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -126,14 +126,14 @@ def _process_single_shot(samples):
         return (
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
-        
+
     def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
         """Convert the data structure from the C++ backend to a common structure through lightning devices.
-        
-        Args: 
+
+        Args:
             probs_result (Any): Result provided by C++ backend.
-            
-        Returns: 
+
+        Returns:
             np.ndarray with probabilities of the supplied observable or wires.
         """
 
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index a2b6aa2085..1e0f1c1839 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -120,7 +120,6 @@ def _state_dtype(self):
         else:
             return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64
 
-
     def syncD2H(self, state_vector, use_async=False):
         """Copy the state vector data on device to a state vector on the host provided by the user.
         Args:
@@ -212,8 +211,8 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
             raise DeviceError("LightningGPU does not support allocate external state_vector.")
 
             # TODO
-            # Create an implementation in the C++ backend and binding to be able 
-            # to allocate memory for a new statevector and copy the data 
+            # Create an implementation in the C++ backend and binding to be able
+            # to allocate memory for a new statevector and copy the data
             # from an external state vector.
             # state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
             # state.getState(state_data)
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index 195944b2a4..41a321b673 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -118,11 +118,11 @@ def _process_single_shot(samples):
 
     def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
         """Convert the data structure from the C++ backend to a common structure through lightning devices.
-        
-        Args: 
+
+        Args:
             probs_result (Any): Result provided by C++ backend.
-            
-        Returns: 
+
+        Returns:
             np.ndarray with probabilities of the supplied observable or wires.
         """
         return probs_results
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index d712040a91..687b141e72 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -144,11 +144,11 @@ def _process_single_shot(samples):
 
     def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
         """Convert the data structure from the C++ backend to a common structure through lightning devices.
-        
-        Args: 
+
+        Args:
             probs_result (Any): Result provided by C++ backend.
-            
-        Returns: 
+
+        Returns:
             np.ndarray with probabilities of the supplied observable or wires.
         """
         return probs_results
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index 78df38c2a0..cac3f067af 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -155,7 +155,7 @@ def test_reset_state(tol, operation, par):
     state_vector = LightningStateVector(wires)
     state_vector.apply_operations([operation(np.array(par), Wires(range(wires)))])
 
-    if device_name == 'lightning.gpu':
+    if device_name == "lightning.gpu":
         state_vector.reset_state(sync=False)
     else:
         state_vector.reset_state()

From 35270fb77cca8ee2c159640457995e4a6e62c188 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 16:16:04 -0400
Subject: [PATCH 33/41] Apply suggestions from code review Ali suggestion docs

Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
---
 pennylane_lightning/lightning_gpu/_measurements.py | 6 +++---
 pennylane_lightning/lightning_gpu/_mpi_handler.py  | 6 ++++--
 pennylane_lightning/lightning_gpu/_state_vector.py | 4 +++-
 pennylane_lightning/lightning_gpu/lightning_gpu.py | 2 +-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 961579ac84..44bfcca60b 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -109,9 +109,9 @@ def _process_single_shot(samples):
                 len(wires), shots.total_shots
             ).astype(int, copy=False)
 
-        except ValueError as e:
-            if str(e) != "probabilities contain NaN":
-                raise e
+        except ValueError as ex:
+            if str(ex) != "probabilities contain NaN":
+                raise ex
             samples = qml.math.full((shots.total_shots, len(wires)), 0)
 
         self._apply_diagonalizing_gates(mps, adjoint=True)
diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index ab7cccc689..baadf1e312 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -15,12 +15,14 @@
 This module contains the :class:`~.LightningGPU_MPIHandler` class, a MPI handler to use LightningGPU device with multi-GPU on multi-node system.
 """
 
+from warnings import warn
 try:
     # pylint: disable=no-name-in-module
     from pennylane_lightning.lightning_gpu_ops import DevTag, MPIManager
 
     MPI_SUPPORT = True
-except ImportError:
+except ImportError as ex:
+    print(str(ex), UserWarning)
     MPI_SUPPORT = False
 
 from typing import Callable, Union
@@ -29,7 +31,7 @@
 
 
 # MPI options
-class LightningGPU_MPIHandler:
+class MPIHandler:
     """MPI handler for PennyLane Lightning GPU device.
 
     MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 1e0f1c1839..3ce244037c 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -128,6 +128,7 @@ def syncD2H(self, state_vector, use_async=False):
             Note: This function only supports synchronized memory copy.
 
         **Example**
+
         >>> dev = qml.device('lightning.gpu', wires=1)
         >>> dev.apply([qml.PauliX(wires=[0])])
         >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.C_DTYPE)
@@ -162,6 +163,7 @@ def syncH2D(self, state_vector, use_async=False):
             Note: This function only supports synchronized memory copy.
 
         **Example**
+
         >>> dev = qml.device('lightning.gpu', wires=3)
         >>> obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2)
         >>> obs1 = qml.Identity(1)
@@ -287,7 +289,7 @@ def _apply_lightning_midmeasure(
     def _apply_lightning(
         self, operations, mid_measurements: dict = None, postselect_mode: str = None
     ):
-        """Apply a list of operations to the state tensor.
+        """Apply a list of operations to the state vector.
 
         Args:
             operations (list[~pennylane.operation.Operation]): operations to apply
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 07a74733a3..3b3351f53f 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -41,7 +41,7 @@
 
 from ._adjoint_jacobian import LightningGPUAdjointJacobian
 from ._measurements import LightningGPUMeasurements
-from ._mpi_handler import LightningGPU_MPIHandler
+from ._mpi_handler import MPIHandler
 from ._state_vector import LightningGPUStateVector
 
 try:

From f51cbb91786a8f07e5069d0fd3584ac499582729 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 20:22:46 +0000
Subject: [PATCH 34/41] propagate namming suggestion

---
 pennylane_lightning/lightning_gpu/_mpi_handler.py  | 1 +
 pennylane_lightning/lightning_gpu/_state_vector.py | 6 +++---
 pennylane_lightning/lightning_gpu/lightning_gpu.py | 4 +---
 tests/lightning_qubit/test_state_vector_class.py   | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index baadf1e312..ca09b8c033 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -16,6 +16,7 @@
 """
 
 from warnings import warn
+
 try:
     # pylint: disable=no-name-in-module
     from pennylane_lightning.lightning_gpu_ops import DevTag, MPIManager
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 3ce244037c..530292614e 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -45,7 +45,7 @@
 from pennylane_lightning.core._serialize import global_phase_diagonal
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
 
-from ._mpi_handler import LightningGPU_MPIHandler
+from ._mpi_handler import MPIHandler
 
 gate_cache_needs_hash = (
     qml.BlockEncode,
@@ -68,7 +68,7 @@ class LightningGPUStateVector(LightningBaseStateVector):
         dtype: Datatypes for state-vector representation. Must be one of
             ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
         device_name(string): state vector device name. Options: ["lightning.gpu"]
-        mpi_handler(LightningGPU_MPIHandler): MPI handler for PennyLane Lightning GPU device.
+        mpi_handler(MPIHandler): MPI handler for PennyLane Lightning GPU device.
             Provides functionality to run on multiple devices.
         sync (bool): immediately sync with host-sv after applying operation.
     """
@@ -87,7 +87,7 @@ def __init__(
         self._device_name = device_name
 
         if mpi_handler is None:
-            mpi_handler = LightningGPU_MPIHandler(False, 0, None, num_wires, dtype)
+            mpi_handler = MPIHandler(False, 0, None, num_wires, dtype)
 
         self._num_global_wires = mpi_handler.num_global_wires
         self._num_local_wires = mpi_handler.num_local_wires
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 3b3351f53f..f43083af70 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -303,9 +303,7 @@ def __init__(  # pylint: disable=too-many-arguments
         self._sync = sync
 
         # Creating the state vector
-        self._mpi_handler = LightningGPU_MPIHandler(
-            mpi, mpi_buf_size, self._dp, len(self.wires), c_dtype
-        )
+        self._mpi_handler = MPIHandler(mpi, mpi_buf_size, self._dp, len(self.wires), c_dtype)
 
         self._statevector = self.LightningStateVector(
             num_wires=len(self.wires), dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index cac3f067af..05f99a9f1f 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -31,7 +31,7 @@
         pass
 
 if device_name == "lightning.gpu":
-    from pennylane_lightning.lightning_gpu._mpi_handler import LightningGPU_MPIHandler
+    from pennylane_lightning.lightning_gpu._mpi_handler import MPIHandler
 
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)

From 65e66e933b3ca278b388a09a54881dcd4bae20c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 17:54:20 -0400
Subject: [PATCH 35/41] Apply suggestions from code review. Ali's suggestion 3

Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
---
 pennylane_lightning/core/_measurements_base.py        |  9 +++++++--
 .../lightning_gpu/StateVectorCudaManaged.hpp          |  5 +++--
 pennylane_lightning/lightning_kokkos/_measurements.py | 11 -----------
 pennylane_lightning/lightning_qubit/_measurements.py  | 11 -----------
 4 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index c979878a9b..a7c50fdf75 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -132,8 +132,13 @@ def expval(self, measurementprocess: MeasurementProcess):
 
     @abstractmethod
     def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
-        """Convert the data structure from the C++ backend to a common structure through lightning devices."""
-
+        """Convert the data structure from the C++ backend to a common structure through lightning devices.
+        Args:
+            probs_result (Any): Result provided by C++ backend.
+        Returns:
+            np.ndarray with probabilities of the supplied observable or wires.
+        """
+        return probs_results;
     def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 808a47f60f..3e538482ca 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -185,11 +185,12 @@ class StateVectorCudaManaged
     }
 
     /**
-     * @brief Prepares a single computational basis state.
+     * @brief Prepare a single computational basis state.
      *
      * @param state Binary number representing the index
      * @param wires Wires.
-     * @param
+     * @param use_async(Optional[bool]): immediately sync with host-sv after applying operation.
+ 
      */
     void setBasisState(const std::vector<std::size_t> &state,
                        const std::vector<std::size_t> &wires,
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index 41a321b673..46260f7edb 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -115,14 +115,3 @@ def _process_single_shot(samples):
         return (
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
-
-    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
-        """Convert the data structure from the C++ backend to a common structure through lightning devices.
-
-        Args:
-            probs_result (Any): Result provided by C++ backend.
-
-        Returns:
-            np.ndarray with probabilities of the supplied observable or wires.
-        """
-        return probs_results
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index 687b141e72..71047e6d19 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -141,14 +141,3 @@ def _process_single_shot(samples):
         return (
             tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
         )
-
-    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
-        """Convert the data structure from the C++ backend to a common structure through lightning devices.
-
-        Args:
-            probs_result (Any): Result provided by C++ backend.
-
-        Returns:
-            np.ndarray with probabilities of the supplied observable or wires.
-        """
-        return probs_results

From 0472fdda2eed4d65f30e28cf3059eab2a0fd138e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 22:28:25 +0000
Subject: [PATCH 36/41] solve errors with kokkos

---
 pennylane_lightning/core/_measurements_base.py | 4 ++--
 pennylane_lightning/core/_state_vector_base.py | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index a7c50fdf75..781333560d 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -130,7 +130,6 @@ def expval(self, measurementprocess: MeasurementProcess):
             measurementprocess.obs.name, measurementprocess.obs.wires
         )
 
-    @abstractmethod
     def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
         """Convert the data structure from the C++ backend to a common structure through lightning devices.
         Args:
@@ -138,7 +137,8 @@ def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
         Returns:
             np.ndarray with probabilities of the supplied observable or wires.
         """
-        return probs_results;
+        return probs_results
+    
     def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index b35d4d6e42..a36c3979a4 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -48,7 +48,7 @@ def __init__(
         self._num_wires = num_wires
         self._wires = Wires(range(num_wires))
         self._dtype = dtype
-        self._sync = sync
+        self._base_sync = sync
 
         # Dummy for the device name
         self._device_name = None
@@ -136,6 +136,7 @@ def _apply_basis_state(self, state, wires, use_async: Optional[bool] = None):
             raise ValueError("BasisState parameter and wires must be of equal length.")
 
         # Return a computational basis state over all wires.
+        print("FSX:",use_async)
         if use_async == None:
             self._qubit_state.setBasisState(list(state), list(wires))
         else:
@@ -197,7 +198,7 @@ def apply_operations(
                 operations = operations[1:]
             elif isinstance(operations[0], BasisState):
                 self._apply_basis_state(
-                    operations[0].parameters[0], operations[0].wires, self._sync
+                    operations[0].parameters[0], operations[0].wires, self._base_sync
                 )
                 operations = operations[1:]
         self._apply_lightning(

From 96728cbb26666c702e77b179def58e3986ea8ba0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 22:29:02 +0000
Subject: [PATCH 37/41] apply format

---
 pennylane_lightning/core/_measurements_base.py               | 2 +-
 pennylane_lightning/core/_state_vector_base.py               | 2 +-
 .../src/simulators/lightning_gpu/StateVectorCudaManaged.hpp  | 5 +++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index 781333560d..dbfb46e20a 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -138,7 +138,7 @@ def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
             np.ndarray with probabilities of the supplied observable or wires.
         """
         return probs_results
-    
+
     def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index a36c3979a4..bef158bc43 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -136,7 +136,7 @@ def _apply_basis_state(self, state, wires, use_async: Optional[bool] = None):
             raise ValueError("BasisState parameter and wires must be of equal length.")
 
         # Return a computational basis state over all wires.
-        print("FSX:",use_async)
+        print("FSX:", use_async)
         if use_async == None:
             self._qubit_state.setBasisState(list(state), list(wires))
         else:
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 3e538482ca..174d23aea2 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -189,8 +189,9 @@ class StateVectorCudaManaged
      *
      * @param state Binary number representing the index
      * @param wires Wires.
-     * @param use_async(Optional[bool]): immediately sync with host-sv after applying operation.
- 
+     * @param use_async(Optional[bool]): immediately sync with host-sv after
+     applying operation.
+
      */
     void setBasisState(const std::vector<std::size_t> &state,
                        const std::vector<std::size_t> &wires,

From 112ead0e1e73ee061d65a1d7f4ce915051514bbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 22:53:57 +0000
Subject: [PATCH 38/41] solve conflicts

---
 .../lightning_gpu/_adjoint_jacobian.py          |  3 ++-
 .../lightning_gpu/lightning_gpu.py              |  2 +-
 .../test_adjoint_jacobian_class.py              | 17 ++++++++++++++---
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
index 4130c1082b..89d86515b2 100644
--- a/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
+++ b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
@@ -38,8 +38,9 @@
         warn(str(ex), UserWarning)
         MPI_SUPPORT = False
 
-except (ImportError, ValueError) as ex:
+except ImportError as ex:
     warn(str(ex), UserWarning)
+    pass
 
 import numpy as np
 from pennylane.tape import QuantumTape
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 4c01ec555f..3be7390f61 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -55,7 +55,7 @@
     LGPU_CPP_BINARY_AVAILABLE = True
 
     try:
-        from ._mpi_handler import LightningGPU_MPIHandler
+        from ._mpi_handler import MPIHandler
 
         MPI_SUPPORT = True
     except ImportError as ex:
diff --git a/tests/lightning_qubit/test_adjoint_jacobian_class.py b/tests/lightning_qubit/test_adjoint_jacobian_class.py
index 236c697f1d..8f8f973754 100644
--- a/tests/lightning_qubit/test_adjoint_jacobian_class.py
+++ b/tests/lightning_qubit/test_adjoint_jacobian_class.py
@@ -420,7 +420,10 @@ def test_multiple_measurements(self, tol, lightning_sv):
         statevector = lightning_sv(num_wires=2)
         result_vjp = self.calculate_vjp(statevector, tape1, dy)
 
-        statevector.reset_state()
+        if device_name == "lightning.gpu":
+            statevector.reset_state(True)
+        else:
+            statevector.reset_state()
 
         result_jac = self.calculate_jacobian(statevector, tape2)
 
@@ -480,7 +483,11 @@ def test_hermitian_expectation(self, tol, lightning_sv):
                 qml.expval(qml.Hermitian(obs, wires=(0,)))
             tape.trainable_params = {0}
 
-            statevector.reset_state()
+            if device_name == "lightning.gpu":
+                statevector.reset_state(True)
+            else:
+                statevector.reset_state()
+
             vjp = self.calculate_vjp(statevector, tape, dy)
 
             assert np.allclose(vjp, -0.8 * np.sin(x), atol=tol)
@@ -497,7 +504,11 @@ def test_hermitian_tensor_expectation(self, tol, lightning_sv):
                 qml.expval(qml.Hermitian(obs, wires=(0,)) @ qml.PauliZ(wires=1))
             tape.trainable_params = {0}
 
-            statevector.reset_state()
+            if device_name == "lightning.gpu":
+                statevector.reset_state(True)
+            else:
+                statevector.reset_state()
+            
             vjp = self.calculate_vjp(statevector, tape, dy)
 
             assert np.allclose(vjp, -0.8 * np.sin(x), atol=tol)

From 87778d6e1d40d38b72faf384e50f43b882d7455f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 22:57:57 +0000
Subject: [PATCH 39/41] apply format

---
 tests/lightning_qubit/test_adjoint_jacobian_class.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/lightning_qubit/test_adjoint_jacobian_class.py b/tests/lightning_qubit/test_adjoint_jacobian_class.py
index 8f8f973754..d5b9355bfb 100644
--- a/tests/lightning_qubit/test_adjoint_jacobian_class.py
+++ b/tests/lightning_qubit/test_adjoint_jacobian_class.py
@@ -508,7 +508,7 @@ def test_hermitian_tensor_expectation(self, tol, lightning_sv):
                 statevector.reset_state(True)
             else:
                 statevector.reset_state()
-            
+
             vjp = self.calculate_vjp(statevector, tape, dy)
 
             assert np.allclose(vjp, -0.8 * np.sin(x), atol=tol)

From c493d4766627cd7bfccd5daa8f25ae81baa6a40c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 23:09:16 +0000
Subject: [PATCH 40/41] solve issue with reset

---
 pennylane_lightning/core/lightning_newAPI_base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pennylane_lightning/core/lightning_newAPI_base.py b/pennylane_lightning/core/lightning_newAPI_base.py
index dcee73fd5c..ae3879bcd8 100644
--- a/pennylane_lightning/core/lightning_newAPI_base.py
+++ b/pennylane_lightning/core/lightning_newAPI_base.py
@@ -68,6 +68,7 @@ def __init__(  # pylint: disable=too-many-arguments
 
         self._c_dtype = c_dtype
         self._batch_obs = batch_obs
+        self._sync = None
 
         if isinstance(wires, int):
             self._wire_map = None  # should just use wires as is
@@ -133,7 +134,7 @@ def jacobian(
         """
         if wire_map is not None:
             [circuit], _ = qml.map_wires(circuit, wire_map)
-        state.reset_state()
+        state.reset_state(self._sync)
         final_state = state.get_final_state(circuit)
         return self.LightningAdjointJacobian(final_state, batch_obs=batch_obs).calculate_jacobian(
             circuit
@@ -191,7 +192,7 @@ def vjp(  # pylint: disable=too-many-arguments
         """
         if wire_map is not None:
             [circuit], _ = qml.map_wires(circuit, wire_map)
-        state.reset_state()
+        state.reset_state(self._sync)
         final_state = state.get_final_state(circuit)
         return self.LightningAdjointJacobian(final_state, batch_obs=batch_obs).calculate_vjp(
             circuit, cotangents

From faead9ca1b92f04f01a53c932741aa6c7273bfd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Tue, 17 Sep 2024 14:28:06 +0000
Subject: [PATCH 41/41] solve error with kokkos

---
 pennylane_lightning/core/_state_vector_base.py           | 1 -
 pennylane_lightning/lightning_kokkos/_state_vector.py    | 2 --
 pennylane_lightning/lightning_kokkos/lightning_kokkos.py | 4 +---
 3 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index bef158bc43..834f8e175f 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -136,7 +136,6 @@ def _apply_basis_state(self, state, wires, use_async: Optional[bool] = None):
             raise ValueError("BasisState parameter and wires must be of equal length.")
 
         # Return a computational basis state over all wires.
-        print("FSX:", use_async)
         if use_async == None:
             self._qubit_state.setBasisState(list(state), list(wires))
         else:
diff --git a/pennylane_lightning/lightning_kokkos/_state_vector.py b/pennylane_lightning/lightning_kokkos/_state_vector.py
index 5e76249de0..b629a17dbe 100644
--- a/pennylane_lightning/lightning_kokkos/_state_vector.py
+++ b/pennylane_lightning/lightning_kokkos/_state_vector.py
@@ -62,7 +62,6 @@ def __init__(
         num_wires,
         dtype=np.complex128,
         kokkos_args=None,
-        sync=True,
     ):  # pylint: disable=too-many-arguments
 
         super().__init__(num_wires, dtype)
@@ -70,7 +69,6 @@ def __init__(
         self._device_name = "lightning.kokkos"
 
         self._kokkos_config = {}
-        self._sync = sync
 
         # Initialize the state vector
         if kokkos_args is None:
diff --git a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
index 4089b3aca7..6cf1d04618 100644
--- a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
+++ b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
@@ -302,7 +302,6 @@ def __init__(  # pylint: disable=too-many-arguments
         shots=None,
         batch_obs=False,
         # Kokkos arguments
-        sync=True,
         kokkos_args=None,
     ):
         if not self._CPP_BINARY_AVAILABLE:
@@ -324,11 +323,10 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # Kokkos specific options
         self._kokkos_args = kokkos_args
-        self._sync = sync
 
         # Creating the state vector
         self._statevector = self.LightningStateVector(
-            num_wires=len(self.wires), dtype=c_dtype, kokkos_args=kokkos_args, sync=sync
+            num_wires=len(self.wires), dtype=c_dtype, kokkos_args=kokkos_args
         )
 
         if not LightningKokkos.kokkos_config: