diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 484cf2ae29..1f4059f9e2 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* Add mid-circuit measurement support to the single-GPU backend of `lightning.gpu`.
+  [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931)
+
 * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users.
   [(#859)](https://github.com/PennyLaneAI/pennylane-lightning/pull/859)
 
@@ -21,8 +24,14 @@
 * Lightning-Kokkos migrated to the new device API.
   [(#810)](https://github.com/PennyLaneAI/pennylane-lightning/pull/810)
 
+* Lightning-GPU migrated to the new device API.
+  [(#853)](https://github.com/PennyLaneAI/pennylane-lightning/pull/853)
+
 ### Breaking changes
 
+* Deprecate `initSV()` and add `resetStateVector()` to `lightning.gpu`.
+  [(#933)](https://github.com/PennyLaneAI/pennylane-lightning/pull/933)
+
 * Deprecate PI gates implementation.
   [(#925)](https://github.com/PennyLaneAI/pennylane-lightning/pull/925)
 
@@ -37,6 +46,21 @@
 
 ### Improvements
 
+* Optimize the Cartesian product to reduce the amount of memory necessary to set the StatePrep with LightningTensor.
+  [(#943)](https://github.com/PennyLaneAI/pennylane-lightning/pull/943)
+
+* The `prob` data returned by the `lightning.gpu` C++ layer is now aligned with the other state-vector backends, and `lightning.gpu` supports out-of-order `qml.prob`.
+  [(#941)](https://github.com/PennyLaneAI/pennylane-lightning/pull/941)
+
+* Add `setStateVector(state, wire)` support to the `lightning.gpu` C++ layer.
+  [(#930)](https://github.com/PennyLaneAI/pennylane-lightning/pull/930)
+
+* Add zero-state initialization to both the `StateVectorCudaManaged` and `StateVectorCudaMPI` constructors, which removes the need for `reset_state` in the Python-layer constructor, and refactor `setBasisState(state, wires)` in the C++ layer.
+  [(#933)](https://github.com/PennyLaneAI/pennylane-lightning/pull/933)
+
+* The `generate_samples` methods of lightning.{qubit/kokkos} can now take in a seed number to make the generated samples deterministic. This can be useful when, among other things, fixing flaky tests in CI.
+  [(#927)](https://github.com/PennyLaneAI/pennylane-lightning/pull/927)
+
 * Always decompose `qml.QFT` in Lightning.
   [(#924)](https://github.com/PennyLaneAI/pennylane-lightning/pull/924)
 
@@ -95,6 +119,15 @@
 
 ### Bug fixes
 
+* Fix missing `liblightning_kokkos_catalyst.so` in Lightning-Kokkos editable installation.
+  [(#945)](https://github.com/PennyLaneAI/pennylane-lightning/pull/945)
+
+* Add a concept restriction to ensure that the `ConstMult` inline function is only selected when multiplying arithmetic values by complex values. This fixes build failures in the test suite when enabling OpenMP and disabling BLAS and Python under clang.
+  [(#936)](https://github.com/PennyLaneAI/pennylane-lightning/pull/936)
+
+* Bug fix for `applyMatrix` in `lightning.tensor`. Matrix operator data is not stored in the `cuGateCache` object to support `TensorProd` obs with multiple `Hermitian` obs.
+  [(#932)](https://github.com/PennyLaneAI/pennylane-lightning/pull/932)
+
 * Bug fix for `_pauli_word` of `QuantumScriptSerializer`. `_pauli_word` can process `PauliWord` object: `I`.
   [(#919)](https://github.com/PennyLaneAI/pennylane-lightning/pull/919)
 
@@ -105,7 +138,7 @@
 
 This release contains contributions from (in alphabetical order):
 
-Ali Asadi, Amintor Dusko, Luis Alfredo Nuñez Meneses, Vincent Michaud-Rioux, Lee J. O'Riordan, Mudit Pandey, Shuli Shu
+Ali Asadi, Amintor Dusko, Luis Alfredo Nuñez Meneses, Vincent Michaud-Rioux, Lee J. O'Riordan, Mudit Pandey, Shuli Shu, Haochen Paul Wang
 
 ---
 
diff --git a/.github/workflows/wheel_linux_aarch64.yml b/.github/workflows/wheel_linux_aarch64.yml
index 63bc629a77..bc21a56822 100644
--- a/.github/workflows/wheel_linux_aarch64.yml
+++ b/.github/workflows/wheel_linux_aarch64.yml
@@ -123,8 +123,13 @@ jobs:
           mkdir Kokkos
           cp -rf ${{ github.workspace }}/Kokkos_install/${{ matrix.exec_model }}/* Kokkos/
 
+      - name: Install Python 3.10
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
       - name: Install dependencies
-        run: python -m pip install cibuildwheel~=2.20.0 tomlkit
+        run: python3.10 -m pip install cibuildwheel~=2.20.0 tomlkit
 
       - name: Configure pyproject.toml file
         run: PL_BACKEND="${{ matrix.pl_backend }}" python scripts/configure_pyproject_toml.py
diff --git a/.github/workflows/wheel_linux_aarch64_cuda.yml b/.github/workflows/wheel_linux_aarch64_cuda.yml
index cc87f033c9..4864fa0167 100644
--- a/.github/workflows/wheel_linux_aarch64_cuda.yml
+++ b/.github/workflows/wheel_linux_aarch64_cuda.yml
@@ -48,8 +48,13 @@ jobs:
       - name: Checkout PennyLane-Lightning
         uses: actions/checkout@v4
 
+      - name: Install Python 3.10
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
       - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel~=2.20.0 tomlkit
+        run: python3.10 -m pip install cibuildwheel~=2.20.0 tomlkit
 
       - name: Configure pyproject.toml file
         run: PL_BACKEND="${{ matrix.pl_backend }}" python scripts/configure_pyproject_toml.py
diff --git a/.github/workflows/wheel_noarch.yml b/.github/workflows/wheel_noarch.yml
index 11460cac1e..0414fcd7b8 100644
--- a/.github/workflows/wheel_noarch.yml
+++ b/.github/workflows/wheel_noarch.yml
@@ -50,7 +50,6 @@ jobs:
         if: ${{ matrix.pl_backend == 'lightning_qubit'}}
         uses: actions/checkout@v4
 
-
       - uses: actions/setup-python@v5
         if: ${{ matrix.pl_backend == 'lightning_qubit'}}
         with:
diff --git a/MANIFEST.in b/MANIFEST.in
index 4c1a79b51d..23ba93b561 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,7 +3,7 @@ include cmake/*
 include requirements.txt
 include .github/CHANGELOG.md
 include pennylane_lightning/lightning_qubit/lightning_qubit.toml
-include pennylane_lightning/lightning_qpu/lightning_gpu.toml
+include pennylane_lightning/lightning_gpu/lightning_gpu.toml
 include pennylane_lightning/lightning_kokkos/lightning_kokkos.toml
 include pennylane_lightning/core/_version.py
 graft pennylane_lightning/core/src/
diff --git a/Makefile b/Makefile
index f43c9e903f..5973200c52 100644
--- a/Makefile
+++ b/Makefile
@@ -35,9 +35,11 @@ help:
 	@echo "  test-cpp [verbose=1]     to run the C++ test suite (requires CMake)"
 	@echo "                           use with 'verbose=1' for building with verbose flag"
 	@echo "  test-cpp [target=?]      to run a specific C++ test target (requires CMake)."
+	@echo "  test-cpp-mpi [backend=?] to run the C++ test suite with MPI (requires CMake and MPI)"
+	@echo "                           Default: lightning_gpu"
 	@echo "  test-python [device=?]   to run the Python test suite"
 	@echo "                           Default: lightning.qubit"
-	@echo "  wheel [backend=?]        to configure and build Python wheels
+	@echo "  wheel [backend=?]        to configure and build Python wheels"
 	@echo "                           Default: lightning_qubit"
 	@echo "  coverage [device=?]      to generate a coverage report for python interface"
 	@echo "                           Default: lightning.qubit"
@@ -98,7 +100,7 @@ coverage-cpp:
 	lcov --directory . -b ../pennylane_lightning/core/src/ --capture --output-file coverage.info; \
 	genhtml coverage.info --output-directory out
 
-.PHONY: test-python test-builtin test-suite test-cpp
+.PHONY: test-python test-builtin test-suite test-cpp test-cpp-mpi
 test-python: test-builtin test-suite
 
 test-builtin:
@@ -124,6 +126,27 @@ else
 	cmake --build ./BuildTests $(VERBOSE) --target test
 endif
 
+test-cpp-mpi:
+	rm -rf ./BuildTests
+	cmake -BBuildTests -G Ninja \
+		  -DCMAKE_BUILD_TYPE=Debug \
+		  -DBUILD_TESTS=ON \
+		  -DENABLE_WARNINGS=ON \
+		  -DPL_BACKEND=lightning_gpu \
+		  -DENABLE_MPI=ON \
+		  $(OPTIONS)
+ifdef target
+	cmake --build ./BuildTests $(VERBOSE) --target $(target)
+	mpirun -np 2 ./BuildTests/$(target)
+else
+	cmake --build ./BuildTests $(VERBOSE)
+	for file in ./BuildTests/*_test_runner_mpi; do \
+		echo "Running $$file"; \
+		mpirun -np 2 $$file ; \
+	done
+endif
+
+
 .PHONY: format format-cpp format-python
 format: format-cpp format-python
 
diff --git a/doc/lightning_gpu/device.rst b/doc/lightning_gpu/device.rst
index a5162c7579..405ea9764d 100644
--- a/doc/lightning_gpu/device.rst
+++ b/doc/lightning_gpu/device.rst
@@ -11,9 +11,9 @@ A ``lightning.gpu`` device can be loaded using:
     import pennylane as qml
     dev = qml.device("lightning.gpu", wires=2)
 
-If the NVIDIA cuQuantum libraries are available, the above device will allow all operations to be performed on a CUDA capable GPU of generation SM 7.0 (Volta) and greater. If the libraries are not correctly installed, or available on path, the device will fall-back to ``lightning.qubit`` and perform all simulation on the CPU.
+If the NVIDIA cuQuantum libraries are available, the above device will allow all operations to be performed on a CUDA capable GPU of generation SM 7.0 (Volta) and greater. If the libraries are not correctly installed, or available on path, the device will raise an error.
 
-The ``lightning.gpu`` device also directly supports quantum circuit gradients using the adjoint differentiation method. This can be enabled at the PennyLane QNode level with:
+The ``lightning.gpu`` device supports quantum circuit gradients using the adjoint differentiation method by default. This can also be set explicitly at the PennyLane QNode level with:
 
 .. code-block:: python
 
@@ -281,3 +281,6 @@ To enable the memory-optimized adjoint method with MPI support, ``batch_obs`` sh
     dev = qml.device('lightning.gpu', wires= n_wires, mpi=True, batch_obs=True)
 
 For the adjoint method, each MPI process will provide the overall simulation results.
+
+.. note::
+    The ``Projector`` observable is not supported by the multi-GPU backend.
diff --git a/mpitests/conftest.py b/mpitests/conftest.py
index a2084f2a5d..552cf9f330 100644
--- a/mpitests/conftest.py
+++ b/mpitests/conftest.py
@@ -98,6 +98,13 @@ def get_device():
 # Device specification
 if device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
+    from pennylane_lightning.lightning_gpu._measurements import (
+        LightningGPUMeasurements as LightningMeasurements,
+    )
+    from pennylane_lightning.lightning_gpu._state_vector import (
+        LightningGPUStateVector as LightningStateVector,
+    )
+
 else:
     raise qml.DeviceError(f"The MPI tests do not apply to the {device_name} device.")
 
diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py
index 6f3b5c7f5b..9d56dfdb1a 100644
--- a/mpitests/test_adjoint_jacobian.py
+++ b/mpitests/test_adjoint_jacobian.py
@@ -26,17 +26,15 @@
 from pennylane import QNode
 from pennylane import numpy as np
 from pennylane import qnode
+from pennylane.devices import ExecutionConfig
+from pennylane.tape import QuantumScript
 from scipy.stats import unitary_group
 
+from pennylane_lightning.lightning_gpu_ops import LightningException
+
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
-I, X, Y, Z = (
-    np.eye(2),
-    qml.PauliX.compute_matrix(),
-    qml.PauliY.compute_matrix(),
-    qml.PauliZ.compute_matrix(),
-)
 
 # Tuple passed to distributed device ctor
 # np.complex for data type and True or False
@@ -59,265 +57,255 @@ def fixture_dev(request):
     )
 
 
-def Rx(theta):
-    r"""One-qubit rotation about the x axis.
-
-    Args:
-        theta (float): rotation angle
-    Returns:
-        array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_x \theta/2}`
-    """
-    return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * X
-
-
-def Ry(theta):
-    r"""One-qubit rotation about the y axis.
-
-    Args:
-        theta (float): rotation angle
-    Returns:
-        array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_y \theta/2}`
-    """
-    return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * Y
-
-
-def Rz(theta):
-    r"""One-qubit rotation about the z axis.
-
-    Args:
-        theta (float): rotation angle
-    Returns:
-        array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_z \theta/2}`
-    """
-    return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * Z
-
-
 class TestAdjointJacobian:  # pylint: disable=too-many-public-methods
     """Tests for the adjoint_jacobian method"""
 
-    def test_not_expval(self, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_not_expval(self, dev, batch_obs):
         """Test if a QuantumFunctionError is raised for a tape with measurements that are not
         expectation values"""
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.1, wires=0)
-            qml.var(qml.PauliZ(0))
+        qs = QuantumScript([qml.RX(1.23, 0)], [qml.var(qml.PauliZ(0))], trainable_params=[0])
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
         with pytest.raises(
             qml.QuantumFunctionError, match="Adjoint differentiation method does not"
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.1, wires=0)
-            qml.state()
+        qs = QuantumScript([qml.RX(1.23, 0)], [qml.state()], trainable_params=[0])
 
-        if device_name == "lightning.gpu":
-            message = "Adjoint differentiation does not support State measurements."
-        else:
-            message = "Adjoint differentiation method does not support measurement StateMP."
         with pytest.raises(
             qml.QuantumFunctionError,
-            match=message,
+            match="Adjoint differentiation method does not support measurement StateMP.",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
-    def test_finite_shots_warns(self):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_finite_shots_warns(self, dev, batch_obs):
         """Tests warning raised when finite shots specified"""
 
-        dev = qml.device(device_name, wires=8, mpi=True, shots=1)
-
-        with qml.tape.QuantumTape() as tape:
-            qml.expval(qml.PauliZ(0))
+        qs = QuantumScript(
+            [qml.RX(1.23, 0)], [qml.expval(qml.Z(0))], shots=10, trainable_params=[0]
+        )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        with pytest.warns(
-            UserWarning,
+        with pytest.raises(
+            qml.QuantumFunctionError,
             match="Requested adjoint differentiation to be computed with finite shots.",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
     def test_empty_measurements(self, dev):
         """Tests if an empty array is returned when the measurements of the tape is empty."""
 
-        with qml.tape.QuantumTape() as tape:
+        def circuit():
             qml.RX(0.4, wires=[0])
+            return qml.expval(qml.PauliZ(0))
+
+        result = QNode(circuit, dev, diff_method="adjoint")
+
+        jac = qml.grad(result)()
 
-        jac = dev.adjoint_jacobian(tape)
         assert len(jac) == 0
 
-    def test_unsupported_op(self, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_unsupported_op(self, batch_obs, dev):
         """Test if a QuantumFunctionError is raised for an unsupported operation, i.e.,
         multi-parameter operations that are not qml.Rot"""
 
-        with qml.tape.QuantumTape() as tape:
-            qml.CRot(0.1, 0.2, 0.3, wires=[0, 1])
-            qml.expval(qml.PauliZ(0))
+        qs = QuantumScript(
+            [qml.CRot(0.1, 0.2, 0.3, wires=[0, 1])],
+            [qml.expval(qml.PauliZ(0))],
+            trainable_params=[0],
+        )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
         with pytest.raises(
-            qml.QuantumFunctionError,
-            match="The CRot operation is not supported using the",
+            LightningException,
+            match="The operation is not supported using the adjoint differentiation method",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
-    def test_proj_unsupported(self, dev):
+    @pytest.mark.skip("WIP: Need a deep review if LGPU accept Projector")
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_proj_unsupported(self, batch_obs, dev):
         """Test if a QuantumFunctionError is raised for a Projector observable"""
-        with qml.tape.QuantumTape() as tape:
-            qml.CRX(0.1, wires=[0, 1])
-            qml.expval(qml.Projector([0, 1], wires=[0, 1]))
+
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
+
+        qs = QuantumScript(
+            [qml.CRX(0.1, wires=[0, 1])],
+            [qml.expval(qml.Projector([0, 1], wires=[0, 1]))],
+            trainable_params=[0],
+        )
 
         with pytest.raises(
             qml.QuantumFunctionError,
             match="differentiation method does not support the Projector",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
-        with qml.tape.QuantumTape() as tape:
-            qml.CRX(0.1, wires=[0, 1])
-            qml.expval(qml.Projector([0], wires=[0]) @ qml.PauliZ(0))
+        qs = QuantumScript(
+            [qml.CRX(0.1, wires=[0, 1])],
+            [qml.expval(qml.Projector([0], wires=[0]) @ qml.PauliZ(0))],
+            trainable_params=[0],
+        )
 
         with pytest.raises(
             qml.QuantumFunctionError,
             match="differentiation method does not support the Projector",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
+
+    @staticmethod
+    def tol_for_allclose(c_dtype):
+        """Compute the tolerance for allclose"""
+        return 1e-3 if c_dtype == np.complex64 else 1e-7
 
     @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7))
     @pytest.mark.parametrize("G", [qml.RX, qml.RY, qml.RZ])
     @pytest.mark.parametrize("stateprep", [qml.QubitStateVector, qml.StatePrep])
-    def test_pauli_rotation_gradient(self, stateprep, G, theta, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_pauli_rotation_gradient(
+        self, stateprep, G, theta, batch_obs, dev
+    ):  # pylint: disable=too-many-arguments
         """Tests that the automatic gradients of Pauli rotations are correct."""
         random_state = np.array(
             [0.43593284 - 0.02945156j, 0.40812291 + 0.80158023j], requires_grad=False
         )
 
-        tape = qml.tape.QuantumScript(
-            [stateprep(random_state, 0), G(theta, 0)], [qml.expval(qml.PauliZ(0))]
+        qs = QuantumScript(
+            [stateprep(random_state, 0), G(theta, 0)],
+            [qml.expval(qml.PauliZ(0))],
+            trainable_params=[1],
         )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        tape.trainable_params = {1}
-
-        calculated_val = dev.adjoint_jacobian(tape)
+        calculated_val = dev.compute_derivatives(qs, config)
 
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         # compare to finite differences
-        tapes, fn = qml.gradients.param_shift(tape)
+        tapes, fn = qml.gradients.param_shift(qs)
         numeric_val = fn(qml.execute(tapes, dev, None))
         assert np.allclose(calculated_val, numeric_val, atol=tol, rtol=0)
 
     @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7))
     @pytest.mark.parametrize("stateprep", [qml.QubitStateVector, qml.StatePrep])
-    def test_Rot_gradient(self, stateprep, theta, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_Rot_gradient(self, stateprep, theta, batch_obs, dev):
         """Tests that the device gradient of an arbitrary Euler-angle-parameterized gate is
         correct."""
         params = np.array([theta, theta**3, np.sqrt(2) * theta])
 
-        with qml.tape.QuantumTape() as tape:
-            stateprep(np.array([1.0, -1.0], requires_grad=False) / np.sqrt(2), wires=0)
-            qml.Rot(*params, wires=[0])
-            qml.expval(qml.PauliZ(0))
+        qs = QuantumScript(
+            [
+                stateprep(np.array([1.0, -1.0], requires_grad=False) / np.sqrt(2), wires=0),
+                qml.Rot(*params, wires=[0]),
+            ],
+            [qml.expval(qml.PauliZ(0))],
+            trainable_params=[1, 2, 3],
+        )
 
-        tape.trainable_params = {1, 2, 3}
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        calculated_val = dev.adjoint_jacobian(tape)
+        calculated_val = dev.compute_derivatives(qs, config)
 
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         # compare to finite differences
-        tapes, fn = qml.gradients.param_shift(tape)
+        tapes, fn = qml.gradients.param_shift(qs)
         numeric_val = fn(qml.execute(tapes, dev, None))
         assert np.allclose(calculated_val, numeric_val, atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("par", [1, -2, 1.623, -0.051, 0])  # integers, floats, zero
-    def test_ry_gradient(self, par, tol, dev):
-        """Test that the gradient of the RY gate matches the exact analytic formula."""
-        with qml.tape.QuantumTape() as tape:
-            qml.RY(par, wires=[0])
-            qml.expval(qml.PauliX(0))
-
-        tape.trainable_params = {0}
+    @pytest.mark.parametrize("param", [1, -2, 1.623, -0.051, 0])  # integers, floats, zero
+    @pytest.mark.parametrize(
+        "rotation, meas, expected_func",
+        [
+            (qml.RY, qml.PauliX, lambda x: np.cos(x)),  # pylint: disable=unnecessary-lambda
+            (qml.RX, qml.PauliZ, lambda x: -np.sin(x)),  # pylint: disable=unnecessary-lambda
+        ],
+    )
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_r_gradient(
+        self, tol, param, rotation, meas, expected_func, batch_obs, dev
+    ):  # pylint: disable=too-many-arguments
+        """Test for the gradient of the rotation gate matches the known formula."""
 
-        # gradients
-        exact = np.cos(par)
-        grad_A = dev.adjoint_jacobian(tape)
+        qs = QuantumScript(
+            [rotation(param, wires=0)],
+            [qml.expval(meas(0))],
+            trainable_params=[0],
+        )
 
-        # different methods must agree
-        assert np.allclose(grad_A, exact, atol=tol, rtol=0)
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-    def test_rx_gradient(self, tol, dev):
-        """Test that the gradient of the RX gate matches the known formula."""
-        a = 0.7418
+        # circuit jacobians
+        dev_jacobian = dev.compute_derivatives(qs, config)
+        expected_jacobian = expected_func(param)
+        assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(a, wires=0)
-            qml.expval(qml.PauliZ(0))
+    @staticmethod
+    def process_and_execute_multiple_rx(dev, params, meas, batch_obs):
+        """Compute the circuit with multiple RX gates"""
+        qs = QuantumScript(
+            [qml.RX(params[0], wires=0), qml.RX(params[1], wires=1), qml.RX(params[2], wires=2)],
+            meas,
+            trainable_params=[0, 1, 2],
+        )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
         # circuit jacobians
-        dev_jacobian = dev.adjoint_jacobian(tape)
-        expected_jacobian = -np.sin(a)
-        assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
+        dev_jacobian = dev.compute_derivatives(qs, config)
 
-    def test_multiple_rx_gradient_pauliz(self, tol, dev):
+        return dev_jacobian
+
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_multiple_rx_gradient_pauliz(self, tol, batch_obs, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result."""
         params = np.array([np.pi, np.pi / 2, np.pi / 3])
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(params[0], wires=0)
-            qml.RX(params[1], wires=1)
-            qml.RX(params[2], wires=2)
-
-            for idx in range(3):
-                qml.expval(qml.PauliZ(idx))
+        meas = [qml.expval(qml.PauliZ(idx)) for idx in range(3)]
 
         # circuit jacobians
-        dev_jacobian = dev.adjoint_jacobian(tape)
+        dev_jacobian = self.process_and_execute_multiple_rx(dev, params, meas, batch_obs)
         expected_jacobian = -np.diag(np.sin(params))
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-    def test_multiple_rx_gradient_hermitian(self, tol, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_multiple_rx_gradient_hermitian(self, tol, batch_obs, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result
         with Hermitian observable
         """
-        params = np.array([np.pi, np.pi / 2, np.pi / 3])
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(params[0], wires=0)
-            qml.RX(params[1], wires=1)
-            qml.RX(params[2], wires=2)
+        params = np.array([np.pi, np.pi / 2, np.pi / 3])
 
-            for idx in range(3):
-                qml.expval(qml.Hermitian([[1, 0], [0, -1]], wires=[idx]))
+        meas = [qml.expval(qml.Hermitian([[1, 0], [0, -1]], wires=[idx])) for idx in range(3)]
 
-        tape.trainable_params = {0, 1, 2}
         # circuit jacobians
-        dev_jacobian = dev.adjoint_jacobian(tape)
+        dev_jacobian = self.process_and_execute_multiple_rx(dev, params, meas, batch_obs)
         expected_jacobian = -np.diag(np.sin(params))
 
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-    qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]  # pylint: disable=no-member
-    ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
-
-    def test_multiple_rx_gradient_expval_hermitian(self, tol, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_multiple_rx_gradient_expval_hermitian(self, tol, batch_obs, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result
         with Hermitian observable
         """
         params = np.array([np.pi / 3, np.pi / 4, np.pi / 5])
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(params[0], wires=0)
-            qml.RX(params[1], wires=1)
-            qml.RX(params[2], wires=2)
-
+        meas = [
             qml.expval(
                 qml.Hermitian(
                     [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]],
                     wires=[0, 2],
                 )
             )
+        ]
 
-        tape.trainable_params = {0, 1, 2}
-        dev_jacobian = dev.adjoint_jacobian(tape)
+        dev_jacobian = self.process_and_execute_multiple_rx(dev, params, meas, batch_obs)
         expected_jacobian = np.array(
             [
                 -np.sin(params[0]) * np.cos(params[2]),
@@ -328,37 +316,31 @@ def test_multiple_rx_gradient_expval_hermitian(self, tol, dev):
 
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-    qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]  # pylint: disable=no-member
-    ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
-
-    def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_multiple_rx_gradient_expval_hamiltonian(self, tol, batch_obs, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result
         with Hermitian observable
         """
         params = np.array([np.pi / 3, np.pi / 4, np.pi / 5])
 
-        ham = qml.Hamiltonian(
-            [1.0, 0.3, 0.3, 0.4],
-            [
-                qml.PauliX(0) @ qml.PauliX(1),
-                qml.PauliZ(0),
-                qml.PauliZ(1),
-                qml.Hermitian(
-                    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]],
-                    wires=[0, 2],
-                ),
-            ],
-        )
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(params[0], wires=0)
-            qml.RX(params[1], wires=1)
-            qml.RX(params[2], wires=2)
-
-            qml.expval(ham)
+        meas = [
+            qml.expval(
+                qml.Hamiltonian(
+                    [1.0, 0.3, 0.3, 0.4],
+                    [
+                        qml.PauliX(0) @ qml.PauliX(1),
+                        qml.PauliZ(0),
+                        qml.PauliZ(1),
+                        qml.Hermitian(
+                            [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]],
+                            wires=[0, 2],
+                        ),
+                    ],
+                )
+            )
+        ]
 
-        tape.trainable_params = {0, 1, 2}
-        dev_jacobian = dev.adjoint_jacobian(tape)
+        dev_jacobian = self.process_and_execute_multiple_rx(dev, params, meas, batch_obs)
         expected_jacobian = (
             0.3 * np.array([-np.sin(params[0]), 0, 0])
             + 0.3 * np.array([0, -np.sin(params[1]), 0])
@@ -374,51 +356,21 @@ def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev):
 
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-    qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]  # pylint: disable=no-member
-    ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
-
-    @pytest.mark.parametrize("obs", [qml.PauliX, qml.PauliY])
     @pytest.mark.parametrize(
-        "op",
+        "meas",
         [
-            qml.RX(0.4, wires=0),
-            qml.RY(0.6, wires=0),
-            qml.RZ(0.8, wires=0),
-            qml.CRX(1.0, wires=[0, 1]),
-            qml.CRY(2.0, wires=[0, 1]),
-            qml.CRZ(3.0, wires=[0, 1]),
-            qml.Rot(0.2, -0.1, 0.2, wires=0),
+            [qml.expval(qml.PauliX(wires=0)), qml.expval(qml.PauliZ(wires=1))],
+            [qml.expval(qml.PauliY(wires=0)), qml.expval(qml.PauliZ(wires=1))],
+            [
+                qml.expval(
+                    qml.Hermitian(
+                        [[0, 0, 1, 1], [0, 1, 2, 1], [1, 2, 1, 0], [1, 1, 0, 0]],
+                        wires=[0, 1],
+                    )
+                )
+            ],
         ],
     )
-    def test_gradients_pauliz(self, op, obs, dev):
-        """Tests that the gradients of circuits match between the finite difference and device
-        methods."""
-        # op.num_wires and op.num_params must be initialized a priori
-        with qml.tape.QuantumTape() as tape:
-            qml.Hadamard(wires=0)
-            qml.RX(0.543, wires=0)
-            qml.CNOT(wires=[0, 1])
-
-            op  # pylint: disable=pointless-statement
-
-            qml.Rot(1.3, -2.3, 0.5, wires=[0])
-            qml.RZ(-0.5, wires=0)
-            qml.adjoint(qml.RY(0.5, wires=1), lazy=False)
-            qml.CNOT(wires=[0, 1])
-
-            qml.expval(obs(wires=0))
-            qml.expval(qml.PauliZ(wires=1))
-
-        tape.trainable_params = set(range(1, 1 + op.num_params))
-
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-
-        # pylint: disable=unnecessary-direct-lambda-call
-        grad_F = (lambda t, fn: fn(qml.execute(t, dev, None)))(*qml.gradients.param_shift(tape))
-        grad_D = dev.adjoint_jacobian(tape)
-
-        assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
-
     @pytest.mark.parametrize(
         "op",
         [
@@ -431,119 +383,72 @@ def test_gradients_pauliz(self, op, obs, dev):
             qml.Rot(0.2, -0.1, 0.2, wires=0),
         ],
     )
-    def test_gradients_hermitian(self, op, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_gradients_pauliz_hermitian(self, op, meas, batch_obs, dev):
         """Tests that the gradients of circuits match between the finite difference and device
         methods."""
         # op.num_wires and op.num_params must be initialized a priori
-        with qml.tape.QuantumTape() as tape:
-            qml.Hadamard(wires=0)
-            qml.RX(0.543, wires=0)
-            qml.CNOT(wires=[0, 1])
-
-            op.queue()
-
-            qml.Rot(1.3, -2.3, 0.5, wires=[0])
-            qml.RZ(-0.5, wires=0)
-            qml.adjoint(qml.RY(0.5, wires=1), lazy=False)
-            qml.CNOT(wires=[0, 1])
-
-            qml.expval(
-                qml.Hermitian(
-                    [[0, 0, 1, 1], [0, 1, 2, 1], [1, 2, 1, 0], [1, 1, 0, 0]],
-                    wires=[0, 1],
-                )
-            )
-
-        tape.trainable_params = set(range(1, 1 + op.num_params))
-
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-
-        # pylint: disable=unnecessary-direct-lambda-call
-        grad_F = (lambda t, fn: fn(qml.execute(t, dev, None)))(*qml.gradients.param_shift(tape))
-        grad_D = dev.adjoint_jacobian(tape)
-
-        assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
-
-    def test_gradient_gate_with_multiple_parameters_pauliz(self, dev):
-        """Tests that gates with multiple free parameters yield correct gradients."""
-        x, y, z = [0.5, 0.3, -0.7]
-
-        tape = qml.tape.QuantumScript(
+        qs = QuantumScript(
             [
-                qml.RX(0.4, wires=[0]),
-                qml.Rot(x, y, z, wires=[0]),
-                qml.RY(-0.2, wires=[0]),
+                qml.Hadamard(wires=0),
+                qml.RX(0.543, wires=0),
+                qml.CNOT(wires=[0, 1]),
+                op,
+                qml.Rot(1.3, -2.3, 0.5, wires=[0]),
+                qml.RZ(-0.5, wires=0),
+                qml.adjoint(qml.RY(0.5, wires=1), lazy=False),
+                qml.CNOT(wires=[0, 1]),
             ],
-            [qml.expval(qml.PauliZ(0))],
+            meas,
+            trainable_params=list(range(1, 1 + op.num_params)),
         )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        tape.trainable_params = {1, 2, 3}
+        tol = self.tol_for_allclose(dev.c_dtype)
 
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-
-        grad_D = dev.adjoint_jacobian(tape)
-        tapes, fn = qml.gradients.param_shift(tape)
+        tapes, fn = qml.gradients.param_shift(qs)
         grad_F = fn(qml.execute(tapes, dev, None))
 
-        # gradient has the correct shape and every element is nonzero
-        assert len(grad_D) == 3
-        assert all(isinstance(v, np.ndarray) for v in grad_D)
-        assert np.count_nonzero(grad_D) == 3
-        # the different methods agree
+        # circuit jacobians
+        grad_D = dev.compute_derivatives(qs, config)
         assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
 
-    def test_gradient_gate_with_multiple_parameters_hermitian(self, dev):
-        """Tests that gates with multiple free parameters yield correct gradients."""
-        x, y, z = [0.5, 0.3, -0.7]
-
-        tape = qml.tape.QuantumScript(
+    @pytest.mark.parametrize(
+        "meas",
+        [
+            [qml.expval(qml.PauliZ(0))],
+            [qml.expval(qml.Hermitian([[0, 1], [1, 1]], wires=0))],
             [
-                qml.RX(0.4, wires=[0]),
-                qml.Rot(x, y, z, wires=[0]),
-                qml.RY(-0.2, wires=[0]),
+                qml.expval(
+                    qml.Hamiltonian(
+                        [1.0, 0.3, 0.3],
+                        [qml.PauliX(0) @ qml.PauliX(1), qml.PauliZ(0), qml.PauliZ(1)],
+                    )
+                )
             ],
-            [qml.expval(qml.Hermitian([[0, 1], [1, 1]], wires=0))],
-        )
-
-        tape.trainable_params = {1, 2, 3}
-
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-
-        grad_D = dev.adjoint_jacobian(tape)
-        tapes, fn = qml.gradients.param_shift(tape)
-        grad_F = fn(qml.execute(tapes, dev, None))
-
-        # gradient has the correct shape and every element is nonzero
-        assert len(grad_D) == 3
-        assert all(isinstance(v, np.ndarray) for v in grad_D)
-        assert np.count_nonzero(grad_D) == 3
-        # the different methods agree
-        assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
-
-    def test_gradient_gate_with_multiple_parameters_hamiltonian(self, dev):
+        ],
+    )
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_gradient_gate_with_multiple_parameters(self, meas, batch_obs, dev):
         """Tests that gates with multiple free parameters yield correct gradients."""
         x, y, z = [0.5, 0.3, -0.7]
 
-        ham = qml.Hamiltonian(
-            [1.0, 0.3, 0.3],
-            [qml.PauliX(0) @ qml.PauliX(1), qml.PauliZ(0), qml.PauliZ(1)],
-        )
-
-        tape = qml.tape.QuantumScript(
+        qs = QuantumScript(
             [
                 qml.RX(0.4, wires=[0]),
                 qml.Rot(x, y, z, wires=[0]),
                 qml.RY(-0.2, wires=[0]),
             ],
-            [qml.expval(ham)],
+            meas,
+            trainable_params=[1, 2, 3],
         )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        tape.trainable_params = {1, 2, 3}
-
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        tol = self.tol_for_allclose(dev.c_dtype)
 
-        grad_D = dev.adjoint_jacobian(tape)
-        tapes, fn = qml.gradients.param_shift(tape)
+        # circuit jacobians
+        grad_D = dev.compute_derivatives(qs, config)
+        tapes, fn = qml.gradients.param_shift(qs)
         grad_F = fn(qml.execute(tapes, dev, None))
 
         # gradient has the correct shape and every element is nonzero
@@ -553,101 +458,45 @@ def test_gradient_gate_with_multiple_parameters_hamiltonian(self, dev):
         # the different methods agree
         assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
 
-    def test_use_device_state(self, tol, dev):
-        """Tests that when using the device state, the correct answer is still returned."""
-
-        x, y, z = [0.5, 0.3, -0.7]
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.Rot(x, y, z, wires=[0])
-            qml.RY(-0.2, wires=[0])
-            qml.expval(qml.PauliZ(0))
-
-        tape.trainable_params = {1, 2, 3}
-
-        dM1 = dev.adjoint_jacobian(tape)
-
-        qml.execute([tape], dev, None)
-        dM2 = dev.adjoint_jacobian(tape, use_device_state=True)
-
-        assert np.allclose(dM1, dM2, atol=tol, rtol=0)
-
-    def test_provide_starting_state(self, tol, dev):
-        """Tests provides correct answer when provided starting state."""
-        comm = MPI.COMM_WORLD
-
-        x, y, z = [0.5, 0.3, -0.7]
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.Rot(x, y, z, wires=[0])
-            qml.RY(-0.2, wires=[0])
-            qml.expval(qml.PauliZ(0))
-
-        tape.trainable_params = {1, 2, 3}
-
-        dM1 = dev.adjoint_jacobian(tape)
 
-        if device_name == "lightning.gpu":
-            local_state_vector = dev.state
-            complex_type = np.complex128 if dev.R_DTYPE == np.float64 else np.complex64
-            state_vector = np.zeros(1 << 8).astype(complex_type)
-            comm.Allgather(local_state_vector, state_vector)
-            qml.execute([tape], dev, None)
-            dM2 = dev.adjoint_jacobian(tape, starting_state=state_vector)
-            assert np.allclose(dM1, dM2, atol=tol, rtol=0)
-
-    def test_provide_wrong_starting_state(self, dev):
-        """Tests raise an exception when provided starting state mismatches."""
-        x, y, z = [0.5, 0.3, -0.7]
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.Rot(x, y, z, wires=[0])
-            qml.RY(-0.2, wires=[0])
-            qml.expval(qml.PauliZ(0))
-
-        tape.trainable_params = {1, 2, 3}
+class TestAdjointJacobianQNode:
+    """Test QNode integration with the adjoint_jacobian method"""
 
-        with pytest.raises(
-            qml.QuantumFunctionError,
-            match="The number of qubits of starting_state must be the same as",
-        ):
-            dev.adjoint_jacobian(tape, starting_state=np.ones(7))
+    # Identity and Pauli matrices used by the Rx/Ry/Rz helpers below.
+    I = np.eye(2)
+    X = qml.PauliX.compute_matrix()
+    Y = qml.PauliY.compute_matrix()
+    Z = qml.PauliZ.compute_matrix()
 
-    @pytest.mark.skipif(
-        device_name == "lightning.gpu",
-        reason="Adjoint differentiation does not support State measurements.",
-    )
-    def test_state_return_type(self, dev):
-        """Tests raise an exception when the return type is State"""
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.state()
+    def Rx(self, theta):
+        r"""One-qubit rotation about the x axis.
 
-        tape.trainable_params = {0}
+        Args:
+            theta (float): rotation angle
+        Returns:
+            array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_x \theta/2}`
+        """
+        return math.cos(theta / 2) * self.I + 1j * math.sin(-theta / 2) * self.X
 
-        with pytest.raises(
-            qml.QuantumFunctionError,
-            match="Adjoint differentiation method does not support measurement StateMP.",
-        ):
-            dev.adjoint_jacobian(tape)
+    def Ry(self, theta):
+        r"""One-qubit rotation about the y axis.
 
+        Args:
+            theta (float): rotation angle
+        Returns:
+            array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_y \theta/2}`
+        """
+        return math.cos(theta / 2) * self.I + 1j * math.sin(-theta / 2) * self.Y
 
-class TestAdjointJacobianQNode:
-    """Test QNode integration with the adjoint_jacobian method"""
+    def Rz(self, theta):
+        r"""One-qubit rotation about the z axis.
 
-    @pytest.fixture(params=fixture_params)
-    def dev(self, request):
-        """Returns a PennyLane device."""
-        return qml.device(
-            device_name,
-            wires=8,
-            mpi=True,
-            c_dtype=request.param[0],
-            batch_obs=request.param[1],
-        )
+        Args:
+            theta (float): rotation angle
+        Returns:
+            array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_z \theta/2}`
+        """
+        return math.cos(theta / 2) * self.I + 1j * math.sin(-theta / 2) * self.Z
 
     def test_finite_shots_error(self):
         """Tests that an error is raised when computing the adjoint diff on a device with finite shots"""
@@ -665,6 +514,11 @@ def circ(x):
 
             qml.grad(circ)(0.1)
 
+    @staticmethod
+    def tol_for_allclose(c_dtype):
+        """Return the ``allclose`` tolerance: 1e-3 for ``np.complex64``, 1e-7 otherwise."""
+        return 1e-3 if c_dtype == np.complex64 else 1e-7
+
     def test_qnode(self, mocker, dev):
         """Test that specifying diff_method allows the adjoint method to be selected"""
         args = np.array([0.54, 0.1, 0.5], requires_grad=True)
@@ -684,15 +538,15 @@ def circuit(x, y, z):
             return qml.expval(qml.PauliX(0) @ qml.PauliZ(1))
 
         qnode1 = QNode(circuit, dev, diff_method="adjoint")
-        spy = mocker.spy(dev.target_device, "adjoint_jacobian")
+        spy = mocker.spy(dev, "LightningAdjointJacobian")
 
         grad_fn = qml.grad(qnode1)
         grad_A = grad_fn(*args)
 
         spy.assert_called()
 
-        h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         qnode2 = QNode(circuit, dev, diff_method="finite-diff", h=h)
         grad_fn = qml.grad(qnode2)
@@ -726,7 +580,7 @@ def cost(p1, p2):
         zero_state = np.array([1.0, 0.0])
         cost(reused_p, other_p)
 
-        spy = mocker.spy(dev.target_device, "adjoint_jacobian")
+        spy = mocker.spy(dev, "LightningAdjointJacobian")
 
         # analytic gradient
         grad_fn = qml.grad(cost)
@@ -737,18 +591,34 @@ def cost(p1, p2):
         # manual gradient
         grad_true0 = (
             expZ(
-                Rx(reused_p) @ Rz(other_p) @ Ry(reused_p + np.pi / 2) @ Rx(extra_param) @ zero_state
+                self.Rx(reused_p)
+                @ self.Rz(other_p)
+                @ self.Ry(reused_p + np.pi / 2)
+                @ self.Rx(extra_param)
+                @ zero_state
             )
             - expZ(
-                Rx(reused_p) @ Rz(other_p) @ Ry(reused_p - np.pi / 2) @ Rx(extra_param) @ zero_state
+                self.Rx(reused_p)
+                @ self.Rz(other_p)
+                @ self.Ry(reused_p - np.pi / 2)
+                @ self.Rx(extra_param)
+                @ zero_state
             )
         ) / 2
         grad_true1 = (
             expZ(
-                Rx(reused_p + np.pi / 2) @ Rz(other_p) @ Ry(reused_p) @ Rx(extra_param) @ zero_state
+                self.Rx(reused_p + np.pi / 2)
+                @ self.Rz(other_p)
+                @ self.Ry(reused_p)
+                @ self.Rx(extra_param)
+                @ zero_state
             )
             - expZ(
-                Rx(reused_p - np.pi / 2) @ Rz(other_p) @ Ry(reused_p) @ Rx(extra_param) @ zero_state
+                self.Rx(reused_p - np.pi / 2)
+                @ self.Rz(other_p)
+                @ self.Ry(reused_p)
+                @ self.Rx(extra_param)
+                @ zero_state
             )
         ) / 2
         expected = grad_true0 + grad_true1  # product rule
@@ -765,10 +635,10 @@ def circuit(params):
             qml.Rot(params[1], params[0], 2 * params[0], wires=[0])
             return qml.expval(qml.PauliX(0))
 
-        spy_analytic = mocker.spy(dev.target_device, "adjoint_jacobian")
+        spy_analytic = mocker.spy(dev, "LightningAdjointJacobian")
 
-        h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         cost = QNode(circuit, dev, diff_method="finite-diff", h=h)
 
@@ -798,7 +668,7 @@ def f(params1, params2):
             qml.RY(tf.cos(params2), wires=[0])
             return qml.expval(qml.PauliZ(0))
 
-        if dev.R_DTYPE == np.float32:
+        if dev.c_dtype == np.complex64:
             tf_r_dtype = tf.float32
         else:
             tf_r_dtype = tf.float64
@@ -806,8 +676,8 @@ def f(params1, params2):
         params1 = tf.Variable(0.3, dtype=tf_r_dtype)
         params2 = tf.Variable(0.4, dtype=tf_r_dtype)
 
-        h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         qnode1 = QNode(f, dev, interface="tf", diff_method="adjoint")
         qnode2 = QNode(f, dev, interface="tf", diff_method="finite-diff", h=h)
@@ -839,7 +709,7 @@ def f(params1, params2):
         params1 = torch.tensor(0.3, requires_grad=True)
         params2 = torch.tensor(0.4, requires_grad=True)
 
-        h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
 
         qnode1 = QNode(f, dev, interface="torch", diff_method="adjoint")
         qnode2 = QNode(f, dev, interface="torch", diff_method="finite-diff", h=h)
@@ -861,7 +731,7 @@ def test_interface_jax(self, dev):
         jax interface"""
 
         jax = pytest.importorskip("jax")
-        if dev.R_DTYPE == np.float64:
+        if dev.c_dtype == np.complex128:
             from jax import config  # pylint: disable=import-outside-toplevel
 
             config.update("jax_enable_x64", True)
@@ -872,11 +742,13 @@ def f(params1, params2):
             qml.RY(jax.numpy.cos(params2), wires=[0])
             return qml.expval(qml.PauliZ(0))
 
-        params1 = jax.numpy.array(0.3, dev.R_DTYPE)
-        params2 = jax.numpy.array(0.4, dev.R_DTYPE)
+        r_dtype = np.float32 if dev.c_dtype == np.complex64 else np.float64
+
+        params1 = jax.numpy.array(0.3, r_dtype)
+        params2 = jax.numpy.array(0.4, r_dtype)
 
-        h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         qnode_adjoint = QNode(f, dev, interface="jax", diff_method="adjoint")
         qnode_fd = QNode(f, dev, interface="jax", diff_method="finite-diff", h=h)
@@ -1379,8 +1251,8 @@ def test_qubit_unitary(dev, n_targets):
     """Tests that ``qml.QubitUnitary`` can be included in circuits differentiated with the adjoint method."""
     n_wires = len(dev.wires)
     dev_def = qml.device("default.qubit", wires=n_wires)
-    h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-    c_dtype = np.complex64 if dev.R_DTYPE == np.float32 else np.complex128
+    h = 1e-3 if dev.c_dtype == np.complex64 else 1e-7
+    c_dtype = dev.c_dtype
 
     np.random.seed(1337)
     par = 2 * np.pi * np.random.rand(n_wires)
@@ -1427,8 +1299,8 @@ def test_diff_qubit_unitary(dev, n_targets):
     """Tests that ``qml.QubitUnitary`` can be differentiated with the adjoint method."""
     n_wires = len(dev.wires)
     dev_def = qml.device("default.qubit", wires=n_wires)
-    h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-    c_dtype = np.complex64 if dev.R_DTYPE == np.float32 else np.complex128
+    h = 1e-3 if dev.c_dtype == np.complex64 else 1e-7
+    c_dtype = dev.c_dtype
 
     np.random.seed(1337)
     par = 2 * np.pi * np.random.rand(n_wires)
diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py
index 17d91cd2d7..5987626f1f 100644
--- a/mpitests/test_apply.py
+++ b/mpitests/test_apply.py
@@ -34,14 +34,17 @@
 )
 
 
-def create_random_init_state(numWires, R_DTYPE, seed_value=48):
+def create_random_init_state(numWires, c_dtype, seed_value=48):
     """Returns a random initial state of a certain type."""
     np.random.seed(seed_value)
-    num_elements = 1 << numWires
-    init_state = np.random.rand(num_elements).astype(R_DTYPE) + 1j * np.random.rand(
+
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
+
+    num_elements = 2**numWires
+    init_state = np.random.rand(num_elements).astype(r_dtype) + 1j * np.random.rand(
         num_elements
-    ).astype(R_DTYPE)
-    scale_sum = np.sqrt(np.sum(np.abs(init_state) ** 2)).astype(R_DTYPE)
+    ).astype(r_dtype)
+    scale_sum = np.sqrt(np.sum(np.abs(init_state) ** 2)).astype(r_dtype)
     init_state = init_state / scale_sum
     return init_state
 
@@ -54,16 +57,13 @@ def apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires):
     num_global_wires = commSize.bit_length() - 1
     num_local_wires = num_wires - num_global_wires
 
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
+    c_dtype = dev_mpi.c_dtype
 
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+    expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+    local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+    local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     comm.Scatter(state_vector, local_state_vector, root=0)
@@ -84,45 +84,6 @@ def circuit(*params):
     assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
 
 
-def apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires):
-    """Wrapper applying a parametric gate with the apply method."""
-    num_wires = numQubits
-    comm = MPI.COMM_WORLD
-    commSize = comm.Get_size()
-    num_global_wires = commSize.bit_length() - 1
-    num_local_wires = num_wires - num_global_wires
-
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
-
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
-
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
-    comm.Bcast(state_vector, root=0)
-
-    comm.Scatter(state_vector, local_state_vector, root=0)
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
-
-    @qml.qnode(dev_cpu)
-    def circuit(*params):
-        qml.StatePrep(state_vector, wires=range(num_wires))
-        operation(*params, wires=Wires)
-        return qml.state()
-
-    expected_output_cpu = np.array(circuit(*par)).astype(c_dtype)
-    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
-
-    dev_mpi.syncH2D(local_state_vector)
-    dev_mpi.apply([operation(*par, wires=Wires)])
-    dev_mpi.syncD2H(local_state_vector)
-
-    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
-
-
 def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires):
     """Wrapper applying a non-parametric gate with QNode function."""
     num_wires = numQubits
@@ -131,16 +92,13 @@ def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires):
     num_global_wires = commSize.bit_length() - 1
     num_local_wires = num_wires - num_global_wires
 
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
+    c_dtype = dev_mpi.c_dtype
 
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+    expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+    local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+    local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     comm.Scatter(state_vector, local_state_vector, root=0)
@@ -161,45 +119,6 @@ def circuit():
     assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
 
 
-def apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires):
-    """Wrapper applying a non-parametric gate with the apply method."""
-    num_wires = numQubits
-    comm = MPI.COMM_WORLD
-    commSize = comm.Get_size()
-    num_global_wires = commSize.bit_length() - 1
-    num_local_wires = num_wires - num_global_wires
-
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
-
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
-
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
-    comm.Bcast(state_vector, root=0)
-
-    comm.Scatter(state_vector, local_state_vector, root=0)
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
-
-    @qml.qnode(dev_cpu)
-    def circuit():
-        qml.StatePrep(state_vector, wires=range(num_wires))
-        operation(wires=Wires)
-        return qml.state()
-
-    expected_output_cpu = np.array(circuit()).astype(c_dtype)
-    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
-
-    dev_mpi.syncH2D(local_state_vector)
-    dev_mpi.apply([operation(wires=Wires)])
-    dev_mpi.syncD2H(local_state_vector)
-
-    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
-
-
 class TestApply:  # pylint: disable=missing-function-docstring,too-many-arguments
     """Tests whether the device can apply supported quantum gates."""
 
@@ -220,13 +139,11 @@ def dev_mpi(self, request):
     @pytest.mark.parametrize("Wires", [0, 1, numQubits - 2, numQubits - 1])
     def test_apply_operation_single_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CNOT, qml.SWAP, qml.CY, qml.CZ])
     @pytest.mark.parametrize("Wires", [[0, 1], [numQubits - 2, numQubits - 1], [0, numQubits - 1]])
     def test_apply_operation_two_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli])
     @pytest.mark.parametrize(
@@ -240,7 +157,6 @@ def test_apply_operation_two_wire_nonparam(self, tol, operation, Wires, dev_mpi)
     )
     def test_apply_operation_three_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli])
     @pytest.mark.parametrize(
@@ -254,7 +170,6 @@ def test_apply_operation_three_wire_nonparam(self, tol, operation, Wires, dev_mp
     )
     def test_apply_operation_three_wire_qnode_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.PhaseShift, qml.RX, qml.RY, qml.RZ])
     @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]])
@@ -263,7 +178,6 @@ def test_apply_operation_1gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize("operation", [qml.Rot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
@@ -272,7 +186,6 @@ def test_apply_operation_1gatequbit_3param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CRot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
@@ -281,7 +194,6 @@ def test_apply_operation_1gatequbit_3param_cgate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize(
         "operation",
@@ -304,7 +216,6 @@ def test_apply_operation_2gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize(
         "operation",
@@ -323,7 +234,6 @@ def test_apply_operation_4gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     # BasisState test
     @pytest.mark.parametrize("operation", [qml.BasisState])
@@ -337,17 +247,17 @@ def test_state_prep(self, tol, operation, index, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
 
-        state_vector = np.zeros(1 << num_wires).astype(c_dtype)
-        expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-        local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-        local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+        state_vector = np.zeros(2**num_wires).astype(c_dtype)
+        expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+        local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -399,17 +309,17 @@ def test_qubit_state_prep(self, tol, par, Wires, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
 
-        state_vector = np.zeros(1 << num_wires).astype(c_dtype)
-        expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-        local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-        local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+        state_vector = np.zeros(2**num_wires).astype(c_dtype)
+        expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+        local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -435,17 +345,17 @@ def test_dev_reset(self, tol, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
 
-        state_vector = np.zeros(1 << num_wires).astype(c_dtype)
-        expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-        local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-        local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+        state_vector = np.zeros(2**num_wires).astype(c_dtype)
+        expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+        local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -462,10 +372,10 @@ def circuit():
         expected_output_cpu = cpu_qnode().astype(c_dtype)
         comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
 
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state()
 
         gpumpi_qnode = qml.QNode(circuit, dev_mpi)
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state()
 
         local_state_vector = gpumpi_qnode()
         assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
@@ -474,8 +384,8 @@ def circuit():
 class TestSparseHamExpval:  # pylint: disable=too-few-public-methods,missing-function-docstring
     """Tests sparse hamiltonian expectation values."""
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_sparse_hamiltonian_expectation(self, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_sparse_hamiltonian_expectation(self, c_dtype):
         comm = MPI.COMM_WORLD
         commSize = comm.Get_size()
         num_global_wires = commSize.bit_length() - 1
@@ -496,32 +406,38 @@ def test_sparse_hamiltonian_expectation(self, C_DTYPE):
                 0.3 + 0.3j,
                 0.3 + 0.5j,
             ],
-            dtype=C_DTYPE,
+            dtype=c_dtype,
         )
 
-        local_state_vector = np.zeros(1 << num_local_wires).astype(C_DTYPE)
+        state_vector /= np.linalg.norm(state_vector)
+
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
         comm.Scatter(state_vector, local_state_vector, root=0)
 
-        dev_gpu = qml.device("lightning.gpu", wires=3, mpi=False, c_dtype=C_DTYPE)
-        dev_mpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=C_DTYPE)
+        H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3))
 
-        dev_mpi.syncH2D(local_state_vector)
-        dev_gpu.syncH2D(state_vector)
+        def circuit():
+            qml.StatePrep(state_vector, wires=range(3))
+            return qml.expval(H_sparse)
 
-        H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3))
+        dev_gpu = qml.device("lightning.gpu", wires=3, mpi=False, c_dtype=c_dtype)
+        gpu_qnode = qml.QNode(circuit, dev_gpu)
+        expected_output_gpu = gpu_qnode()
+        comm.Bcast(np.array(expected_output_gpu), root=0)
 
-        comm.Barrier()
+        dev_mpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=c_dtype)
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
+        expected_output_mpi = mpi_qnode()
 
-        res = dev_mpi.expval(H_sparse)
-        expected = dev_gpu.expval(H_sparse)
+        comm.Barrier()
 
-        assert np.allclose(res, expected)
+        assert np.allclose(expected_output_mpi, expected_output_gpu)
 
 
 class TestExpval:
     """Tests that expectation values are properly calculated or that the proper errors are raised."""
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
     @pytest.mark.parametrize(
         "operation",
         [
@@ -533,7 +449,7 @@ class TestExpval:
         ],
     )
     @pytest.mark.parametrize("wires", [0, 1, 2, numQubits - 3, numQubits - 2, numQubits - 1])
-    def test_expval_single_wire_no_parameters(self, tol, operation, wires, C_DTYPE):
+    def test_expval_single_wire_no_parameters(self, tol, operation, wires, c_dtype):
         """Tests that expectation values are properly calculated for single-wire observables without parameters."""
         num_wires = numQubits
         comm = MPI.COMM_WORLD
@@ -541,14 +457,14 @@ def test_expval_single_wire_no_parameters(self, tol, operation, wires, C_DTYPE):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        dev_mpi = qml.device("lightning.gpu", wires=numQubits, mpi=True, c_dtype=C_DTYPE)
+        dev_mpi = qml.device("lightning.gpu", wires=numQubits, mpi=True, c_dtype=c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
         comm.Bcast(state_vector, root=0)
 
-        local_state_vector = np.zeros(1 << num_local_wires).astype(C_DTYPE)
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
         comm.Scatter(state_vector, local_state_vector, root=0)
-        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
 
         def circuit():
             qml.StatePrep(state_vector, wires=range(num_wires))
@@ -563,7 +479,7 @@ def circuit():
 
         assert np.allclose(expected_output_mpi, expected_output_cpu, atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
     @pytest.mark.parametrize(
         "obs",
         [
@@ -575,12 +491,12 @@ def circuit():
             qml.PauliZ(numQubits - 2) @ qml.PauliZ(numQubits - 1),
         ],
     )
-    def test_expval_multiple_obs(self, obs, tol, C_DTYPE):
+    def test_expval_multiple_obs(self, obs, tol, c_dtype):
         """Test expval with Hamiltonian"""
         num_wires = numQubits
 
-        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
-        dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+        dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype)
 
         def circuit():
             qml.RX(0.4, wires=[0])
@@ -592,7 +508,7 @@ def circuit():
 
         assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
     @pytest.mark.parametrize(
         "obs, coeffs",
         [
@@ -620,14 +536,14 @@ def circuit():
             ),
         ],
     )
-    def test_expval_hamiltonian(self, obs, coeffs, tol, C_DTYPE):
+    def test_expval_hamiltonian(self, obs, coeffs, tol, c_dtype):
         """Test expval with Hamiltonian"""
         num_wires = numQubits
 
         ham = qml.Hamiltonian(coeffs, obs)
 
-        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
-        dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+        dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype)
 
         def circuit():
             qml.RX(0.4, wires=[0])
@@ -665,14 +581,14 @@ def circuit():
 class TestGenerateSample:
     """Tests that samples are properly calculated."""
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_sample_dimensions(self, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_sample_dimensions(self, c_dtype):
         """Tests if the samples returned by sample have
         the correct dimensions
         """
         num_wires = numQubits
 
-        dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+        dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype)
 
         ops = [qml.RX(1.5708, wires=[0]), qml.RX(1.5708, wires=[1])]
 
@@ -697,14 +613,14 @@ def test_sample_dimensions(self, C_DTYPE):
 
         assert np.array_equal(s3.shape, (shots,))
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_sample_values(self, tol, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_sample_values(self, tol, c_dtype):
         """Tests if the samples returned by sample have
         the correct values
         """
         num_wires = numQubits
 
-        dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+        dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype)
 
         shots = qml.measurements.Shots(1000)
         ops = [qml.RX(1.5708, wires=[0])]
@@ -716,17 +632,17 @@ def test_sample_values(self, tol, C_DTYPE):
         # they square to 1
         assert np.allclose(s1**2, 1, atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_sample_values_qnode(self, tol, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_sample_values_qnode(self, tol, c_dtype):
         """Tests if the samples returned by sample have
         the correct values
         """
         num_wires = numQubits
 
         dev_mpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state()
 
         @qml.qnode(dev_mpi)
         def circuit():
@@ -737,15 +653,15 @@ def circuit():
         # they square to 1
         assert np.allclose(circuit() ** 2, 1, atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_multi_samples_return_correlated_results(self, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_multi_samples_return_correlated_results(self, c_dtype):
         """Tests if the samples returned by the sample function have
         the correct dimensions
         """
         num_wires = 3
 
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         @qml.qnode(dev_gpumpi)
@@ -758,13 +674,13 @@ def circuit():
 
         assert np.array_equal(outcomes[0], outcomes[1])
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_paulix_pauliy(self, C_DTYPE, tol=TOL_STOCHASTIC):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_paulix_pauliy(self, c_dtype, tol=TOL_STOCHASTIC):
         """Test that a tensor product involving PauliX and PauliY works correctly"""
         num_wires = 3
 
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         theta = 0.432
@@ -800,13 +716,13 @@ def circuit():
         ) / 16
         assert np.allclose(var, expected, atol=tol)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_pauliz_hadamard(self, C_DTYPE, tol=TOL_STOCHASTIC):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_pauliz_hadamard(self, c_dtype, tol=TOL_STOCHASTIC):
         """Test that a tensor product involving PauliZ and PauliY and hadamard works correctly"""
         num_wires = 3
 
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         theta = 0.432
@@ -846,13 +762,13 @@ def circuit():
 class TestTensorVar:
     """Test tensor variance measurements."""
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_paulix_pauliy(self, C_DTYPE, tol=TOL_STOCHASTIC):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_paulix_pauliy(self, c_dtype, tol=TOL_STOCHASTIC):
         """Test that a tensor product involving PauliX and PauliY works correctly"""
         num_wires = 3
 
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         theta = 0.432
@@ -880,12 +796,12 @@ def circuit():
         ) / 16
         assert np.allclose(res, expected, atol=tol)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_pauliz_hadamard(self, C_DTYPE, tol=TOL_STOCHASTIC):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_pauliz_hadamard(self, c_dtype, tol=TOL_STOCHASTIC):
         """Test that a tensor product involving PauliZ and PauliY and hadamard works correctly"""
         num_wires = 3
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         theta = 0.432
diff --git a/mpitests/test_device.py b/mpitests/test_device.py
index 03a1880114..dd783dbee7 100644
--- a/mpitests/test_device.py
+++ b/mpitests/test_device.py
@@ -38,13 +38,13 @@ def test_create_device():
 
 
 def test_unsupported_mpi_buf_size():
-    with pytest.raises(TypeError, match="Unsupported mpi_buf_size value"):
+    with pytest.raises(ValueError, match="Unsupported mpi_buf_size value"):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=-1)
-    with pytest.raises(TypeError, match="Unsupported mpi_buf_size value"):
+    with pytest.raises(ValueError, match="Unsupported mpi_buf_size value"):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=3)
-    with pytest.warns(
-        RuntimeWarning,
-        match="The MPI buffer size is larger than the local state vector size",
+    with pytest.raises(
+        RuntimeError,
+        match="The MPI buffer size is larger than the local state vector size.",
     ):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=2**4)
     with pytest.raises(
diff --git a/mpitests/test_expval.py b/mpitests/test_expval.py
index d020471c03..3ca73cd82e 100644
--- a/mpitests/test_expval.py
+++ b/mpitests/test_expval.py
@@ -22,114 +22,260 @@
 from conftest import PHI, THETA, VARPHI, device_name
 from mpi4py import MPI
 
+numQubits = 8
 
-@pytest.mark.parametrize("theta, phi", list(zip(THETA, PHI)))
-class TestExpval:
-    """Test expectation values"""
 
-    def test_identity_expectation(self, theta, phi, tol):
-        """Test that identity expectation value (i.e. the trace) is 1"""
-        dev = qml.device(device_name, mpi=True, wires=3)
+def create_random_init_state(numWires, c_dtype, seed_value=48):
+    """Return a seeded, normalized random complex state vector of length 2**numWires."""
+    np.random.seed(seed_value)  # reseed the global NumPy RNG so the state is reproducible
 
-        O1 = qml.Identity(wires=[0])
-        O2 = qml.Identity(wires=[1])
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32  # float32 unless complex128
 
-        dev.apply(
-            [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-        )
+    num_elements = 2**numWires
+    init_state = np.random.rand(num_elements).astype(r_dtype) + 1j * np.random.rand(
+        num_elements
+    ).astype(r_dtype)
 
-        res = np.array([dev.expval(O1), dev.expval(O2)])
-        assert np.allclose(res, np.array([1, 1]), tol)
+    init_state = init_state / np.linalg.norm(init_state)  # scale to unit 2-norm
+    return init_state
 
-    def test_pauliz_expectation(self, theta, phi, tol):
-        """Test that PauliZ expectation value is correct"""
-        dev = qml.device(device_name, mpi=True, wires=3)
 
-        O1 = qml.PauliZ(wires=[0])
-        O2 = qml.PauliZ(wires=[1])
+def apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires):
+    """Compare a parametric gate's state on lightning.qubit with this rank's MPI state."""
+    num_wires = numQubits
+    comm = MPI.COMM_WORLD
+    commSize = comm.Get_size()
+    num_global_wires = commSize.bit_length() - 1  # floor(log2(commSize))
+    num_local_wires = num_wires - num_global_wires
 
-        dev.apply(
-            [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-        )
+    c_dtype = dev_mpi.c_dtype
 
-        res = np.array([dev.expval(O1), dev.expval(O2)])
-        assert np.allclose(res, np.array([np.cos(theta), np.cos(theta) * np.cos(phi)]), tol)
+    expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)  # placeholder, reassigned below
+    local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+    local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-    def test_paulix_expectation(self, theta, phi, tol):
-        """Test that PauliX expectation value is correct"""
-        dev = qml.device(device_name, mpi=True, wires=3)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
+    comm.Bcast(state_vector, root=0)  # every rank ends up with rank 0's copy
 
-        O1 = qml.PauliX(wires=[0])
-        O2 = qml.PauliX(wires=[1])
+    comm.Scatter(state_vector, local_state_vector, root=0)
+    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
 
-        dev.apply(
-            [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-        )
+    def circuit(*params):
+        qml.StatePrep(state_vector, wires=range(num_wires))
+        operation(*params, wires=Wires)
+        return qml.state()
+
+    cpu_qnode = qml.QNode(circuit, dev_cpu)
+    expected_output_cpu = cpu_qnode(*par).astype(c_dtype)
+    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)  # keep this rank's chunk
+
+    mpi_qnode = qml.QNode(circuit, dev_mpi)
+    local_state_vector = mpi_qnode(*par)  # rebinds the name; the scattered buffer is dropped
+
+    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
+
+
+def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires):
+    """Compare a non-parametric gate's state on lightning.qubit with this rank's MPI state."""
+    num_wires = numQubits
+    comm = MPI.COMM_WORLD
+    commSize = comm.Get_size()
+    num_global_wires = commSize.bit_length() - 1  # floor(log2(commSize))
+    num_local_wires = num_wires - num_global_wires
+
+    c_dtype = dev_mpi.c_dtype
+
+    expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)  # placeholder, reassigned below
+    local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+    local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
+
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
+    comm.Bcast(state_vector, root=0)  # every rank ends up with rank 0's copy
 
-        res = np.array([dev.expval(O1), dev.expval(O2)], dtype=dev.C_DTYPE)
-        assert np.allclose(
-            res,
-            np.array([np.sin(theta) * np.sin(phi), np.sin(phi)], dtype=dev.C_DTYPE),
-            tol * 10,
+    comm.Scatter(state_vector, local_state_vector, root=0)
+    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+
+    def circuit():
+        qml.StatePrep(state_vector, wires=range(num_wires))
+        operation(wires=Wires)
+        return qml.state()
+
+    cpu_qnode = qml.QNode(circuit, dev_cpu)
+    expected_output_cpu = cpu_qnode().astype(c_dtype)
+    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)  # keep this rank's chunk
+
+    mpi_qnode = qml.QNode(circuit, dev_mpi)
+    local_state_vector = mpi_qnode()  # rebinds the name; the scattered buffer is dropped
+
+    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
+
+
+@pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+@pytest.mark.parametrize("batch_obs", [True, False])
+class TestExpval:
+    """Tests that expectation values are properly calculated or that the proper errors are raised."""
+
+    @pytest.mark.parametrize(
+        "operation",
+        [
+            qml.PauliX,
+            qml.PauliY,
+            qml.PauliZ,
+            qml.Hadamard,
+            qml.Identity,
+        ],
+    )
+    @pytest.mark.parametrize("wires", [0, 1, 2, numQubits - 2, numQubits - 1])
+    def test_expval_single_wire_no_parameters(self, tol, operation, wires, c_dtype, batch_obs):
+        """Tests that expectation values are properly calculated for single-wire observables without parameters."""
+        num_wires = numQubits
+        comm = MPI.COMM_WORLD
+
+        dev_mpi = qml.device(
+            "lightning.gpu", wires=numQubits, mpi=True, c_dtype=c_dtype, batch_obs=batch_obs
         )
 
-    def test_pauliy_expectation(self, theta, phi, tol):
-        """Test that PauliY expectation value is correct"""
-        dev = qml.device(device_name, mpi=True, wires=3)
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
 
-        O1 = qml.PauliY(wires=[0])
-        O2 = qml.PauliY(wires=[1])
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
+        comm.Bcast(state_vector, root=0)  # every rank prepares rank 0's state
 
-        dev.apply(
-            [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
+        def circuit():
+            qml.StatePrep(state_vector, wires=range(num_wires))
+            return qml.expval(operation(wires))
+
+        cpu_qnode = qml.QNode(circuit, dev_cpu)
+        expected_output_cpu = np.array(cpu_qnode())  # named buffer so Bcast can fill it in place
+        comm.Bcast(expected_output_cpu, root=0)  # all ranks compare against rank 0's reference
+
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
+        expected_output_mpi = mpi_qnode()
+
+        assert np.allclose(expected_output_mpi, expected_output_cpu, atol=tol, rtol=0)
+
+    @pytest.mark.parametrize(
+        "obs",
+        [
+            qml.PauliX(0) @ qml.PauliZ(1),
+            qml.PauliX(0) @ qml.PauliZ(numQubits - 1),
+            qml.PauliX(numQubits - 2) @ qml.PauliZ(numQubits - 1),
+            qml.PauliZ(0) @ qml.PauliZ(1),
+            qml.PauliZ(0) @ qml.PauliZ(numQubits - 1),
+            qml.PauliZ(numQubits - 2) @ qml.PauliZ(numQubits - 1),
+        ],
+    )
+    def test_expval_multiple_obs(self, obs, tol, c_dtype, batch_obs):
+        """Test expval of tensor-product observables against lightning.qubit."""
+        num_wires = numQubits
+
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+        dev_mpi = qml.device(
+            "lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype, batch_obs=batch_obs
         )
 
-        res = np.array([dev.expval(O1), dev.expval(O2)])
-        assert np.allclose(res, np.array([0, -np.cos(theta) * np.sin(phi)]), tol)
+        def circuit():
+            qml.RX(0.4, wires=[0])
+            qml.RY(-0.2, wires=[num_wires - 1])  # rotate the last wire
+            return qml.expval(obs)
 
-    def test_hadamard_expectation(self, theta, phi, tol):
-        """Test that Hadamard expectation value is correct"""
-        dev = qml.device(device_name, mpi=True, wires=3)
+        cpu_qnode = qml.QNode(circuit, dev_cpu)
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
+
+        assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0)
+
+    @pytest.mark.parametrize(
+        "obs, coeffs",
+        [
+            ([qml.PauliX(0) @ qml.PauliZ(1)], [0.314]),
+            ([qml.PauliX(0) @ qml.PauliZ(numQubits - 1)], [0.314]),
+            ([qml.PauliZ(0) @ qml.PauliZ(1)], [0.314]),
+            ([qml.PauliZ(0) @ qml.PauliZ(numQubits - 1)], [0.314]),
+            (
+                [qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)],
+                [0.314, 0.2],
+            ),
+            (
+                [
+                    qml.PauliX(0) @ qml.PauliZ(numQubits - 1),
+                    qml.PauliZ(0) @ qml.PauliZ(1),
+                ],
+                [0.314, 0.2],
+            ),
+            (
+                [
+                    qml.PauliX(numQubits - 2) @ qml.PauliZ(numQubits - 1),
+                    qml.PauliZ(0) @ qml.PauliZ(1),
+                ],
+                [0.314, 0.2],
+            ),
+        ],
+    )
+    def test_expval_hamiltonian(self, obs, coeffs, tol, c_dtype, batch_obs):
+        """Test expval of a qml.Hamiltonian built from coeffs and obs against lightning.qubit."""
+        num_wires = numQubits
+
+        ham = qml.Hamiltonian(coeffs, obs)
+
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+        dev_mpi = qml.device(
+            "lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype, batch_obs=batch_obs
+        )
+
+        def circuit():
+            qml.RX(0.4, wires=[0])
+            qml.RY(-0.2, wires=[numQubits - 1])  # rotate the last wire
+            return qml.expval(ham)
 
-        O1 = qml.Hadamard(wires=[0])
-        O2 = qml.Hadamard(wires=[1])
+        cpu_qnode = qml.QNode(circuit, dev_cpu)
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
 
-        dev.apply(
-            [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
+        assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0)
+
+    def test_expval_non_pauli_word_hamiltionian(self, tol, c_dtype, batch_obs):
+        """Tests expval of a scaled Hadamard observable against lightning.qubit."""
+        dev_mpi = qml.device(
+            "lightning.gpu", wires=3, mpi=True, c_dtype=c_dtype, batch_obs=batch_obs
         )
+        dev_cpu = qml.device("lightning.qubit", wires=3)  # NOTE(review): no c_dtype passed here
+
+        theta = 0.432
+        phi = 0.123
+        varphi = -0.543
+
+        def circuit():
+            qml.RX(theta, wires=[0])
+            qml.RX(phi, wires=[1])
+            qml.RX(varphi, wires=[2])
+            qml.CNOT(wires=[0, 1])
+            qml.CNOT(wires=[1, 2])
+            return qml.expval(0.5 * qml.Hadamard(2))
+
+        cpu_qnode = qml.QNode(circuit, dev_cpu)
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
+
+        assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0)
 
-        res = np.array([dev.expval(O1), dev.expval(O2)])
-        expected = np.array(
-            [
-                np.sin(theta) * np.sin(phi) + np.cos(theta),
-                np.cos(theta) * np.cos(phi) + np.sin(phi),
-            ]
-        ) / np.sqrt(2)
-        assert np.allclose(res, expected, tol)
-
-    @pytest.mark.parametrize("n_wires", range(1, 8))
-    def test_hermitian_expectation(self, n_wires, theta, phi, tol):
+    @pytest.mark.parametrize("theta, phi", list(zip(THETA, PHI)))
+    @pytest.mark.parametrize("n_wires", range(1, numQubits))
+    def test_hermitian_expectation(self, n_wires, theta, phi, tol, c_dtype, batch_obs):
         """Test that Hadamard expectation value is correct"""
-        n_qubits = 7
+        n_qubits = numQubits - 1
         dev_def = qml.device("default.qubit", wires=n_qubits)
-        dev = qml.device(device_name, mpi=True, wires=n_qubits)
+        dev = qml.device(
+            device_name, mpi=True, wires=n_qubits, c_dtype=c_dtype, batch_obs=batch_obs
+        )
         comm = MPI.COMM_WORLD
 
         m = 2**n_wires
         U = np.random.rand(m, m) + 1j * np.random.rand(m, m)
         U = U + np.conj(U.T)
-        U = U.astype(dev.C_DTYPE)
+        U = U.astype(dev.c_dtype)
         comm.Bcast(U, root=0)
         obs = qml.Hermitian(U, wires=range(n_wires))
 
         init_state = np.random.rand(2**n_qubits) + 1j * np.random.rand(2**n_qubits)
-        init_state /= np.sqrt(np.dot(np.conj(init_state), init_state))
-        init_state = init_state.astype(dev.C_DTYPE)
+        init_state = init_state / np.linalg.norm(init_state)
+        init_state = init_state.astype(dev.c_dtype)
         comm.Bcast(init_state, root=0)
 
         def circuit():
@@ -250,69 +396,39 @@ def circuit(x, y):
 class TestTensorExpval:
     """Test tensor expectation values"""
 
-    def test_paulix_pauliy(self, theta, phi, varphi, tol):
+    @pytest.mark.parametrize(
+        "obs,expected",
+        [
+            (qml.PauliX(0) @ qml.PauliY(2), "PXPY"),
+            (qml.PauliZ(0) @ qml.Identity(1) @ qml.PauliZ(2), "PZIPZ"),
+            (qml.PauliZ(0) @ qml.Hadamard(1) @ qml.PauliY(2), "PZHPY"),
+        ],
+    )
+    def test_tensor(self, theta, phi, varphi, obs, expected, tol):
         """Test that a tensor product involving PauliX and PauliY works
         correctly"""
         dev = qml.device(device_name, mpi=True, wires=3)
-        obs = qml.PauliX(0) @ qml.PauliY(2)
-
-        dev.apply(
-            [
-                qml.RX(theta, wires=[0]),
-                qml.RX(phi, wires=[1]),
-                qml.RX(varphi, wires=[2]),
-                qml.CNOT(wires=[0, 1]),
-                qml.CNOT(wires=[1, 2]),
-            ],
-            rotations=obs.diagonalizing_gates(),
-        )
-        res = dev.expval(obs)
-
-        expected = np.sin(theta) * np.sin(phi) * np.sin(varphi)
-
-        assert np.allclose(res, expected, atol=tol)
-
-    def test_pauliz_identity(self, theta, phi, varphi, tol):
-        """Test that a tensor product involving PauliZ and Identity works
-        correctly"""
-        dev = qml.device(device_name, mpi=True, wires=3)
-        obs = qml.PauliZ(0) @ qml.Identity(1) @ qml.PauliZ(2)
-
-        dev.apply(
-            [
-                qml.RX(theta, wires=[0]),
-                qml.RX(phi, wires=[1]),
-                qml.RX(varphi, wires=[2]),
-                qml.CNOT(wires=[0, 1]),
-                qml.CNOT(wires=[1, 2]),
-            ],
-            rotations=obs.diagonalizing_gates(),
-        )
-
-        res = dev.expval(obs)
-
-        expected = np.cos(varphi) * np.cos(phi)
 
-        assert np.allclose(res, expected, tol)
-
-    def test_pauliz_hadamard_pauliy(self, theta, phi, varphi, tol):
-        """Test that a tensor product involving PauliZ and PauliY and Hadamard
-        works correctly"""
-        dev = qml.device(device_name, mpi=True, wires=3)
-        obs = qml.PauliZ(0) @ qml.Hadamard(1) @ qml.PauliY(2)
-
-        dev.apply(
-            [
-                qml.RX(theta, wires=[0]),
-                qml.RX(phi, wires=[1]),
-                qml.RX(varphi, wires=[2]),
-                qml.CNOT(wires=[0, 1]),
-                qml.CNOT(wires=[1, 2]),
-            ],
-            rotations=obs.diagonalizing_gates(),
-        )
+        def circuit():
+            qml.RX(theta, wires=[0])
+            qml.RX(phi, wires=[1])
+            qml.RX(varphi, wires=[2])
+            qml.CNOT(wires=[0, 1])
+            qml.CNOT(wires=[1, 2])
+            return qml.expval(obs)
 
-        res = dev.expval(obs)
-        expected = -(np.cos(varphi) * np.sin(phi) + np.sin(varphi) * np.cos(theta)) / np.sqrt(2)
+        mpi_qnode = qml.QNode(circuit, dev)
+        res = mpi_qnode()
+
+        if expected == "PXPY":  # "expected" is a tag selecting the closed-form value
+            expected_val = np.sin(theta) * np.sin(phi) * np.sin(varphi)
+        elif expected == "PZIPZ":
+            expected_val = np.cos(varphi) * np.cos(phi)
+        elif expected == "PZHPY":
+            expected_val = -(
+                np.cos(varphi) * np.sin(phi) + np.sin(varphi) * np.cos(theta)
+            ) / np.sqrt(2)
+        else:
+            expected_val = 0  # fallback for an unrecognized tag
 
-        assert np.allclose(res, expected, tol)
+        assert np.allclose(res, expected_val, atol=tol)
diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py
new file mode 100644
index 0000000000..78bde9872a
--- /dev/null
+++ b/mpitests/test_native_mcm.py
@@ -0,0 +1,43 @@
+# Copyright 2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for unsupported mid-circuit measurements on Lightning-GPU with MPI."""
+import numpy as np
+import pennylane as qml
+import pytest
+from conftest import LightningDevice, device_name
+from mpi4py import MPI
+
+if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access
+    pytest.skip("No binary module found. Skipping.", allow_module_level=True)
+
+
+def test_unspported_mid_measurement():
+    """Test unsupported mid_measurement for Lightning-GPU-MPI."""
+    comm = MPI.COMM_WORLD
+    dev = qml.device(device_name, wires=2, mpi=True, shots=1000)
+    params = np.pi / 4 * np.ones(2)
+
+    @qml.qnode(dev)
+    def func(x, y):
+        qml.RX(x, wires=0)
+        m0 = qml.measure(0)
+        qml.cond(m0, qml.RY)(y, wires=1)
+        return qml.probs(wires=0)
+
+    comm.Barrier()
+
+    with pytest.raises(
+        qml.DeviceError, match="Lightning-GPU-MPI does not support Mid-circuit measurements."
+    ):
+        func(*params)
diff --git a/mpitests/test_probs.py b/mpitests/test_probs.py
index b2f57f733a..ed9ab9b9c8 100644
--- a/mpitests/test_probs.py
+++ b/mpitests/test_probs.py
@@ -23,27 +23,31 @@
 numQubits = 8
 
 
-def create_random_init_state(numWires, R_DTYPE, seed_value=48):
+def create_random_init_state(numWires, c_dtype, seed_value=48):
+    """Returns a random initial state of a certain type."""
     np.random.seed(seed_value)
-    num_elements = 1 << numWires
-    init_state = np.random.rand(num_elements).astype(R_DTYPE) + 1j * np.random.rand(
+
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
+
+    num_elements = 2**numWires
+    init_state = np.random.rand(num_elements).astype(r_dtype) + 1j * np.random.rand(
         num_elements
-    ).astype(R_DTYPE)
-    scale_sum = np.sqrt(np.sum(np.abs(init_state) ** 2)).astype(R_DTYPE)
-    init_state = init_state / scale_sum
+    ).astype(r_dtype)
+
+    init_state = init_state / np.linalg.norm(init_state)
     return init_state
 
 
-def apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE):
+def apply_probs_nonparam(tol, operation, GateWires, Wires, c_dtype):
     num_wires = numQubits
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     commSize = comm.Get_size()
 
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
-    dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+    dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     def circuit():
@@ -58,15 +62,16 @@ def circuit():
     local_probs = mpi_qnode()
 
     recv_counts = comm.gather(len(local_probs), root=0)
-
     comm.Barrier()
 
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
+
     if rank == 0:
-        probs_mpi = np.zeros(1 << len(Wires)).astype(dev_mpi.R_DTYPE)
-        displacements = [i for i in range(commSize)]
+        probs_mpi = np.zeros(2 ** len(Wires)).astype(r_dtype)
     else:
         probs_mpi = None
         probs_cpu = None
+
     comm.Barrier()
     comm.Gatherv(local_probs, [probs_mpi, recv_counts], root=0)
 
@@ -75,16 +80,16 @@ def circuit():
     comm.Barrier()
 
 
-def apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE):
+def apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype):
     num_wires = numQubits
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     commSize = comm.Get_size()
 
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
-    dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+    dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     def circuit():
@@ -102,8 +107,10 @@ def circuit():
 
     comm.Barrier()
 
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
+
     if rank == 0:
-        probs_mpi = np.zeros(1 << len(Wires)).astype(dev_mpi.R_DTYPE)
+        probs_mpi = np.zeros(2 ** len(Wires)).astype(r_dtype)
     else:
         probs_mpi = None
         probs_cpu = None
@@ -116,6 +123,19 @@ def circuit():
     comm.Barrier()
 
 
+@pytest.mark.parametrize(
+    "Wires",
+    [
+        [0],
+        [1],
+        [0, 1],
+        [0, 2],
+        [0, numQubits - 1],
+        [numQubits - 2, numQubits - 1],
+        range(numQubits),
+    ],
+)
+@pytest.mark.parametrize("c_dtype", [np.complex128])
 class TestProbs:
     """Tests for the probability method."""
 
@@ -123,41 +143,15 @@ class TestProbs:
         "operation", [qml.PauliX, qml.PauliY, qml.PauliZ, qml.Hadamard, qml.S, qml.T]
     )
     @pytest.mark.parametrize("GateWires", [[0], [numQubits - 1]])
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_single_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE):
-        apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE)
+    def test_prob_single_wire_nonparam(self, tol, operation, GateWires, Wires, c_dtype):
+        apply_probs_nonparam(tol, operation, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.CNOT, qml.SWAP, qml.CY, qml.CZ])
     @pytest.mark.parametrize(
         "GateWires", [[0, 1], [numQubits - 2, numQubits - 1], [0, numQubits - 1]]
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_two_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE):
-        apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE)
+    def test_prob_two_wire_nonparam(self, tol, operation, GateWires, Wires, c_dtype):
+        apply_probs_nonparam(tol, operation, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli])
     @pytest.mark.parametrize(
@@ -169,80 +163,28 @@ def test_prob_two_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE)
             [0, numQubits - 2, numQubits - 1],
         ],
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_three_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE):
-        apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE)
+    def test_prob_three_wire_nonparam(self, tol, operation, GateWires, Wires, c_dtype):
+        apply_probs_nonparam(tol, operation, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.PhaseShift, qml.RX, qml.RY, qml.RZ])
     @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]])
     @pytest.mark.parametrize("GateWires", [0, numQubits - 1])
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_single_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_single_wire_param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.Rot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
     @pytest.mark.parametrize("GateWires", [0, numQubits - 1])
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_single_wire_3param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_single_wire_3param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.CRot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
     @pytest.mark.parametrize(
         "GateWires", [[0, numQubits - 1], [0, 1], [numQubits - 2, numQubits - 1]]
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_two_wire_3param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_two_wire_3param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize(
         "operation",
@@ -263,21 +205,8 @@ def test_prob_two_wire_3param(self, tol, operation, par, GateWires, Wires, C_DTY
     @pytest.mark.parametrize(
         "GateWires", [[0, numQubits - 1], [0, 1], [numQubits - 2, numQubits - 1]]
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_two_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_two_wire_param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize(
         "operation",
@@ -292,18 +221,5 @@ def test_prob_two_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYP
             [numQubits - 4, numQubits - 3, numQubits - 2, numQubits - 1],
         ],
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_four_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_four_wire_param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
diff --git a/pennylane_lightning/core/_adjoint_jacobian_base.py b/pennylane_lightning/core/_adjoint_jacobian_base.py
index 50046d5f94..a779c0cc4c 100644
--- a/pennylane_lightning/core/_adjoint_jacobian_base.py
+++ b/pennylane_lightning/core/_adjoint_jacobian_base.py
@@ -111,6 +111,7 @@ def _process_jacobian_tape(self, tape: QuantumTape, split_obs: bool = False):
             self._qubit_state.device_name, use_csingle, use_mpi, split_obs
         ).serialize_ops(tape)
 
+        # pylint: disable=not-callable
         ops_serialized = self._create_ops_list_lightning(*ops_serialized)
 
         # We need to filter out indices in trainable_params which do not
diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index 06ae878899..1e4f54b4ef 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -56,6 +56,7 @@ def __init__(
     ) -> None:
         self._qubit_state = qubit_state
 
+        self._use_mpi = False
         # Dummy for the C++ bindings
         self._measurement_lightning = None
 
@@ -94,7 +95,6 @@ def state_diagonalizing_gates(self, measurementprocess: StateMeasurement) -> Ten
         self._qubit_state.apply_operations([qml.adjoint(g) for g in reversed(diagonalizing_gates)])
         return result
 
-    # pylint: disable=protected-access
     def expval(self, measurementprocess: MeasurementProcess):
         """Expectation value of the supplied observable contained in the MeasurementProcess.
 
@@ -121,8 +121,9 @@ def expval(self, measurementprocess: MeasurementProcess):
             or (measurementprocess.obs.arithmetic_depth > 0)
             or isinstance(measurementprocess.obs.name, List)
         ):
+            # pylint: disable=protected-access
             ob_serialized = QuantumScriptSerializer(
-                self._qubit_state.device_name, self.dtype == np.complex64
+                self._qubit_state.device_name, self.dtype == np.complex64, self._use_mpi
             )._ob(measurementprocess.obs)
             return self._measurement_lightning.expval(ob_serialized)
 
@@ -134,19 +135,23 @@ def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
         Args:
-            measurementprocess (StateMeasurement): measurement to apply to the state
+            measurementprocess (StateMeasurement): measurement to apply to the state.
 
         Returns:
-            Probabilities of the supplied observable or wires
+            Probabilities of the supplied observable or wires.
         """
         diagonalizing_gates = measurementprocess.diagonalizing_gates()
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(diagonalizing_gates)
+
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
+
         return results
 
     def var(self, measurementprocess: MeasurementProcess):
@@ -175,8 +180,9 @@ def var(self, measurementprocess: MeasurementProcess):
             or (measurementprocess.obs.arithmetic_depth > 0)
             or isinstance(measurementprocess.obs.name, List)
         ):
+            # pylint: disable=protected-access
             ob_serialized = QuantumScriptSerializer(
-                self._qubit_state.device_name, self.dtype == np.complex64
+                self._qubit_state.device_name, self.dtype == np.complex64, self._use_mpi
             )._ob(measurementprocess.obs)
             return self._measurement_lightning.var(ob_serialized)
 
@@ -187,6 +193,7 @@ def var(self, measurementprocess: MeasurementProcess):
     def get_measurement_function(
         self, measurementprocess: MeasurementProcess
     ) -> Callable[[MeasurementProcess, TensorLike], TensorLike]:
+        # pylint: disable=too-many-return-statements
         """Get the appropriate method for performing a measurement.
 
         Args:
@@ -197,16 +204,24 @@ def get_measurement_function(
         """
         if isinstance(measurementprocess, StateMeasurement):
             if isinstance(measurementprocess, ExpectationMP):
-                if isinstance(measurementprocess.obs, (qml.Identity, qml.Projector)):
-                    return self.state_diagonalizing_gates
+                if self._use_mpi:
+                    if isinstance(measurementprocess.obs, (qml.Projector)):
+                        return self.state_diagonalizing_gates
+                else:
+                    if isinstance(measurementprocess.obs, (qml.Identity, qml.Projector)):
+                        return self.state_diagonalizing_gates
                 return self.expval
 
             if isinstance(measurementprocess, ProbabilityMP):
                 return self.probs
 
             if isinstance(measurementprocess, VarianceMP):
-                if isinstance(measurementprocess.obs, (qml.Identity, qml.Projector)):
-                    return self.state_diagonalizing_gates
+                if self._use_mpi:
+                    if isinstance(measurementprocess.obs, (qml.Projector)):
+                        return self.state_diagonalizing_gates
+                else:
+                    if isinstance(measurementprocess.obs, (qml.Identity, qml.Projector)):
+                        return self.state_diagonalizing_gates
                 return self.var
             if measurementprocess.obs is None or measurementprocess.obs.has_diagonalizing_gates:
                 return self.state_diagonalizing_gates
diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index 3e08a5ab40..8815e13a04 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -16,7 +16,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Union
+from typing import Optional, Union
 
 import numpy as np
 from pennylane import BasisState, StatePrep
@@ -101,7 +101,7 @@ def reset_state(self):
         self._qubit_state.resetStateVector()
 
     @abstractmethod
-    def _apply_state_vector(self, state, device_wires: Wires):
+    def _apply_state_vector(self, state, device_wires: Wires, sync: Optional[bool] = None):
         """Initialize the internal state vector in a specified state.
         Args:
             state (array[complex]): normalized input state of length ``2**len(wires)``
@@ -117,6 +117,7 @@ def _apply_basis_state(self, state, wires):
                 consisting of 0s and 1s.
             wires (Wires): wires that the provided computational state should be
                 initialized on
+            use_async(Optional[bool]): immediately sync with host-sv after applying operation.
 
         Note: This function does not support broadcasted inputs yet.
         """
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 37f7ba1bbf..243d1c7ad5 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -15,5 +15,4 @@
 """Version information.
    Version number (major.minor.patch[-label])
 """
-
 __version__ = "0.39.0-dev46"
diff --git a/pennylane_lightning/core/lightning_newAPI_base.py b/pennylane_lightning/core/lightning_newAPI_base.py
index dcee73fd5c..12cdf98b4e 100644
--- a/pennylane_lightning/core/lightning_newAPI_base.py
+++ b/pennylane_lightning/core/lightning_newAPI_base.py
@@ -90,6 +90,49 @@ def c_dtype(self):
     def _set_lightning_classes(self):
         """Load the LightningStateVector, LightningMeasurements, LightningAdjointJacobian as class attribute"""
 
+    @abstractmethod
+    def _setup_execution_config(self, config):
+        """
+        Update the execution config with choices for how the device should be used and the device options.
+        """
+
+    @abstractmethod
+    def preprocess(self, execution_config: ExecutionConfig = DefaultExecutionConfig):
+        """This function defines the device transform program to be applied and an updated device configuration.
+
+        Args:
+            execution_config (Union[ExecutionConfig, Sequence[ExecutionConfig]]): A data structure describing the
+                parameters needed to fully describe the execution.
+
+        Returns:
+            TransformProgram, ExecutionConfig: A transform program that when called returns :class:`~.QuantumTape`'s that the
+            device can natively execute as well as a postprocessing function to be called after execution, and a configuration
+            with unset specifications filled in.
+
+        This device:
+
+        * Supports any qubit operations that provide a matrix
+        * Currently does not support finite shots
+        * Currently does not intrinsically support parameter broadcasting
+
+        """
+
+    @abstractmethod
+    def execute(
+        self,
+        circuits: QuantumTape_or_Batch,
+        execution_config: ExecutionConfig = DefaultExecutionConfig,
+    ) -> Result_or_ResultBatch:
+        """Execute a circuit or a batch of circuits and turn it into results.
+
+        Args:
+            circuits (Union[QuantumTape, Sequence[QuantumTape]]): the quantum circuits to be executed
+            execution_config (ExecutionConfig): a datastructure with additional information required for execution
+
+        Returns:
+            TensorLike, tuple[TensorLike], tuple[tuple[TensorLike]]: A numeric result of the computation.
+        """
+
     @abstractmethod
     def simulate(
         self,
@@ -112,6 +155,25 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
 
+    @abstractmethod
+    def supports_derivatives(
+        self,
+        execution_config: Optional[ExecutionConfig] = None,
+        circuit: Optional[qml.tape.QuantumTape] = None,
+    ) -> bool:
+        """Check whether or not derivatives are available for a given configuration and circuit.
+
+        Lightning devices are expected to support adjoint differentiation with analytic results.
+
+        Args:
+            execution_config (ExecutionConfig): The configuration of the desired derivative calculation
+            circuit (QuantumTape): An optional circuit to check derivatives support for.
+
+        Returns:
+            bool: Whether or not a derivative can be calculated provided the given information
+
+        """
+
     def jacobian(
         self,
         circuit: QuantumTape,
@@ -135,6 +197,7 @@ def jacobian(
             [circuit], _ = qml.map_wires(circuit, wire_map)
         state.reset_state()
         final_state = state.get_final_state(circuit)
+        # pylint: disable=not-callable
         return self.LightningAdjointJacobian(final_state, batch_obs=batch_obs).calculate_jacobian(
             circuit
         )
@@ -163,6 +226,7 @@ def simulate_and_jacobian(
         if wire_map is not None:
             [circuit], _ = qml.map_wires(circuit, wire_map)
         res = self.simulate(circuit, state)
+        # pylint: disable=not-callable
         jac = self.LightningAdjointJacobian(state, batch_obs=batch_obs).calculate_jacobian(circuit)
         return res, jac
 
@@ -193,6 +257,7 @@ def vjp(  # pylint: disable=too-many-arguments
             [circuit], _ = qml.map_wires(circuit, wire_map)
         state.reset_state()
         final_state = state.get_final_state(circuit)
+        # pylint: disable=not-callable
         return self.LightningAdjointJacobian(final_state, batch_obs=batch_obs).calculate_vjp(
             circuit, cotangents
         )
@@ -224,6 +289,7 @@ def simulate_and_vjp(  # pylint: disable=too-many-arguments
         if wire_map is not None:
             [circuit], _ = qml.map_wires(circuit, wire_map)
         res = self.simulate(circuit, state)
+        # pylint: disable=not-callable
         _vjp = self.LightningAdjointJacobian(state, batch_obs=batch_obs).calculate_vjp(
             circuit, cotangents
         )
diff --git a/pennylane_lightning/core/src/algorithms/tests/mpi/Test_AdjointJacobianMPI.cpp b/pennylane_lightning/core/src/algorithms/tests/mpi/Test_AdjointJacobianMPI.cpp
index b4e617eec4..9953b218f0 100644
--- a/pennylane_lightning/core/src/algorithms/tests/mpi/Test_AdjointJacobianMPI.cpp
+++ b/pennylane_lightning/core/src/algorithms/tests/mpi/Test_AdjointJacobianMPI.cpp
@@ -87,7 +87,6 @@ template <typename TypeList> void testAdjointJacobian() {
 
             StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                              nGlobalIndexBits, nLocalIndexBits);
-            psi.initSV();
 
             const auto obs1 = std::make_shared<NamedObsMPI<StateVectorT>>(
                 "PauliZ", std::vector<std::size_t>{0});
@@ -138,7 +137,6 @@ template <typename TypeList> void testAdjointJacobian() {
 
             StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                              nGlobalIndexBits, nLocalIndexBits);
-            psi.initSV();
 
             const auto obs1 = std::make_shared<NamedObsMPI<StateVectorT>>(
                 "PauliZ", std::vector<std::size_t>{0});
@@ -189,7 +187,6 @@ template <typename TypeList> void testAdjointJacobian() {
 
             StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                              nGlobalIndexBits, nLocalIndexBits);
-            psi.initSV();
 
             const auto obs = std::make_shared<TensorProdObsMPI<StateVectorT>>(
                 std::make_shared<NamedObsMPI<StateVectorT>>(
@@ -240,7 +237,6 @@ template <typename TypeList> void testAdjointJacobian() {
 
             StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                              nGlobalIndexBits, nLocalIndexBits);
-            psi.initSV();
 
             const auto obs = std::make_shared<TensorProdObsMPI<StateVectorT>>(
                 std::make_shared<NamedObsMPI<StateVectorT>>(
@@ -310,7 +306,6 @@ template <typename TypeList> void testAdjointJacobian() {
 
             StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                              nGlobalIndexBits, nLocalIndexBits);
-            psi.initSV();
 
             auto obs1 = std::make_shared<NamedObsMPI<StateVectorT>>(
                 "PauliZ", std::vector<std::size_t>{0});
@@ -362,7 +357,6 @@ template <typename TypeList> void testAdjointJacobian() {
 
             StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                              nGlobalIndexBits, nLocalIndexBits);
-            psi.initSV();
 
             auto obs1 = std::make_shared<TensorProdObsMPI<StateVectorT>>(
                 std::make_shared<NamedObsMPI<StateVectorT>>(
diff --git a/pennylane_lightning/core/src/measurements/MeasurementsBase.hpp b/pennylane_lightning/core/src/measurements/MeasurementsBase.hpp
index 50a76610dc..62fd82e1ab 100644
--- a/pennylane_lightning/core/src/measurements/MeasurementsBase.hpp
+++ b/pennylane_lightning/core/src/measurements/MeasurementsBase.hpp
@@ -77,7 +77,6 @@ template <class StateVectorT, class Derived> class MeasurementsBase {
     /**
      * @brief Randomly set the seed of the internal random generator
      *
-     * @param seed Seed
      */
     void setRandomSeed() {
         std::random_device rd;
diff --git a/pennylane_lightning/core/src/measurements/tests/Test_MeasurementsBase.cpp b/pennylane_lightning/core/src/measurements/tests/Test_MeasurementsBase.cpp
index 674659a9cc..f32a44a363 100644
--- a/pennylane_lightning/core/src/measurements/tests/Test_MeasurementsBase.cpp
+++ b/pennylane_lightning/core/src/measurements/tests/Test_MeasurementsBase.cpp
@@ -20,6 +20,7 @@ using Pennylane::Util::isApproxEqual;
 } // namespace
 /// @endcond
 #include <algorithm>
+#include <optional>
 #include <string>
 
 #ifdef _ENABLE_PLQUBIT
@@ -84,44 +85,33 @@ template <typename TypeList> void testProbabilities() {
         // Expected results calculated with Pennylane default.qubit:
         std::vector<
             std::pair<std::vector<std::size_t>, std::vector<PrecisionT>>>
-            input = {
-#if defined(_ENABLE_PLGPU)
-                // Bit index reodering conducted in the python layer
-                // for L-GPU. Also L-GPU backend doesn't support
-                // out of order wires for probability calculation
-                {{2, 1, 0},
-                 {0.67078706, 0.03062806, 0.0870997, 0.00397696, 0.17564072,
-                  0.00801973, 0.02280642, 0.00104134}}
-#else
-                // LightningQubit currently supports arbitrary wire index
-                // ordering.
-                {{0, 2, 1},
-                 {0.67078706, 0.0870997, 0.03062806, 0.00397696, 0.17564072,
-                  0.02280642, 0.00801973, 0.00104134}},
-                {{1, 0, 2},
-                 {0.67078706, 0.03062806, 0.17564072, 0.00801973, 0.0870997,
-                  0.00397696, 0.02280642, 0.00104134}},
-                {{1, 2, 0},
-                 {0.67078706, 0.17564072, 0.03062806, 0.00801973, 0.0870997,
-                  0.02280642, 0.00397696, 0.00104134}},
-                {{2, 0, 1},
-                 {0.67078706, 0.0870997, 0.17564072, 0.02280642, 0.03062806,
-                  0.00397696, 0.00801973, 0.00104134}},
-                {{2, 1, 0},
-                 {0.67078706, 0.17564072, 0.0870997, 0.02280642, 0.03062806,
-                  0.00801973, 0.00397696, 0.00104134}},
-                {{2, 1}, {0.84642778, 0.10990612, 0.0386478, 0.0050183}},
-                {{0, 1, 2},
-                 {0.67078706, 0.03062806, 0.0870997, 0.00397696, 0.17564072,
-                  0.00801973, 0.02280642, 0.00104134}},
-                {{0, 1}, {0.70141512, 0.09107666, 0.18366045, 0.02384776}},
-                {{0, 2}, {0.75788676, 0.03460502, 0.19844714, 0.00906107}},
-                {{1, 2}, {0.84642778, 0.0386478, 0.10990612, 0.0050183}},
-                {{0}, {0.79249179, 0.20750821}},
-                {{1}, {0.88507558, 0.11492442}},
-                {{2}, {0.9563339, 0.0436661}}
-#endif
-            };
+            input = {// All tested state-vector backends support arbitrary
+                     // wire index ordering.
+                     {{0, 2, 1},
+                      {0.67078706, 0.0870997, 0.03062806, 0.00397696,
+                       0.17564072, 0.02280642, 0.00801973, 0.00104134}},
+                     {{1, 0, 2},
+                      {0.67078706, 0.03062806, 0.17564072, 0.00801973,
+                       0.0870997, 0.00397696, 0.02280642, 0.00104134}},
+                     {{1, 2, 0},
+                      {0.67078706, 0.17564072, 0.03062806, 0.00801973,
+                       0.0870997, 0.02280642, 0.00397696, 0.00104134}},
+                     {{2, 0, 1},
+                      {0.67078706, 0.0870997, 0.17564072, 0.02280642,
+                       0.03062806, 0.00397696, 0.00801973, 0.00104134}},
+                     {{2, 1, 0},
+                      {0.67078706, 0.17564072, 0.0870997, 0.02280642,
+                       0.03062806, 0.00801973, 0.00397696, 0.00104134}},
+                     {{2, 1}, {0.84642778, 0.10990612, 0.0386478, 0.0050183}},
+                     {{0, 1, 2},
+                      {0.67078706, 0.03062806, 0.0870997, 0.00397696,
+                       0.17564072, 0.00801973, 0.02280642, 0.00104134}},
+                     {{0, 1}, {0.70141512, 0.09107666, 0.18366045, 0.02384776}},
+                     {{0, 2}, {0.75788676, 0.03460502, 0.19844714, 0.00906107}},
+                     {{1, 2}, {0.84642778, 0.0386478, 0.10990612, 0.0050183}},
+                     {{0}, {0.79249179, 0.20750821}},
+                     {{1}, {0.88507558, 0.11492442}},
+                     {{2}, {0.9563339, 0.0436661}}};
 
         // Defining the Statevector that will be measured.
         auto statevector_data = createNonTrivialState<StateVectorT>();
@@ -403,11 +393,7 @@ template <typename TypeList> void testProbabilitiesObsShots() {
             std::size_t num_shots = 10000;
             auto prob_obs_shots = Measurer_obs_shots.probs(*obs, num_shots);
 
-#ifdef _ENABLE_PLGPU
-            auto prob = Measurer.probs(std::vector<std::size_t>({2, 1, 0}));
-#else
             auto prob = Measurer.probs(std::vector<std::size_t>({0, 1, 2}));
-#endif
 
             REQUIRE_THAT(prob_obs_shots, Catch::Approx(prob).margin(5e-2));
         }
@@ -433,11 +419,7 @@ template <typename TypeList> void testProbabilitiesObsShots() {
 
             std::size_t num_shots = 10000;
             auto prob_obs_shots = Measurer_obs_shots.probs(*obs, num_shots);
-#ifdef _ENABLE_PLGPU
-            auto prob = Measurer.probs(std::vector<std::size_t>({2, 1, 0}));
-#else
             auto prob = Measurer.probs(std::vector<std::size_t>({0, 1, 2}));
-#endif
 
             REQUIRE_THAT(prob_obs_shots, Catch::Approx(prob).margin(5e-2));
         }
@@ -1251,7 +1233,9 @@ TEST_CASE("Var Shot- TensorProdObs", "[MeasurementsBase][Observables]") {
         testTensorProdObsVarShot<TestStateVectorBackends>();
     }
 }
-template <typename TypeList> void testSamples() {
+
+template <typename TypeList>
+void testSamples(const std::optional<std::size_t> &seed = std::nullopt) {
     if constexpr (!std::is_same_v<TypeList, void>) {
         using StateVectorT = typename TypeList::Type;
         using PrecisionT = typename StateVectorT::PrecisionT;
@@ -1281,7 +1265,10 @@ template <typename TypeList> void testSamples() {
         std::size_t num_qubits = 3;
         std::size_t N = std::pow(2, num_qubits);
         std::size_t num_samples = 100000;
-        auto &&samples = Measurer.generate_samples(num_samples);
+        auto &&samples =
+            seed.has_value()
+                ? Measurer.generate_samples(num_samples, seed.value())
+                : Measurer.generate_samples(num_samples);
 
         std::vector<std::size_t> counts(N, 0);
         std::vector<std::size_t> samples_decimal(num_samples, 0);
@@ -1307,7 +1294,7 @@ template <typename TypeList> void testSamples() {
             REQUIRE_THAT(probabilities,
                          Catch::Approx(expected_probabilities).margin(.05));
         }
-        testSamples<typename TypeList::Next>();
+        testSamples<typename TypeList::Next>(seed);
     }
 }
 
@@ -1317,6 +1304,12 @@ TEST_CASE("Samples", "[MeasurementsBase]") {
     }
 }
 
+TEST_CASE("Seeded samples", "[MeasurementsBase]") {
+    if constexpr (BACKEND_FOUND) {
+        testSamples<TestStateVectorBackends>(37);
+    }
+}
+
 template <typename TypeList> void testSamplesCountsObs() {
     if constexpr (!std::is_same_v<TypeList, void>) {
         using StateVectorT = typename TypeList::Type;
@@ -1729,4 +1722,4 @@ TEST_CASE("Measure Shot - SparseHObs ", "[MeasurementsBase][Observables]") {
     if constexpr (BACKEND_FOUND) {
         testSparseHObsMeasureShot<TestStateVectorBackends>();
     }
-}
\ No newline at end of file
+}
diff --git a/pennylane_lightning/core/src/measurements/tests/mpi/Test_MeasurementsBaseMPI.cpp b/pennylane_lightning/core/src/measurements/tests/mpi/Test_MeasurementsBaseMPI.cpp
index 4a90d8849d..7f6411263e 100644
--- a/pennylane_lightning/core/src/measurements/tests/mpi/Test_MeasurementsBaseMPI.cpp
+++ b/pennylane_lightning/core/src/measurements/tests/mpi/Test_MeasurementsBaseMPI.cpp
@@ -58,7 +58,7 @@ template <typename TypeList> void testProbabilities() {
             input = {// Bit index reodering conducted in the python layer
                      // for L-GPU. Also L-GPU backend doesn't support
                      // out of order wires for probability calculation
-                     {{2, 1, 0},
+                     {{0, 1, 2},
                       {0.67078706, 0.03062806, 0.0870997, 0.00397696,
                        0.17564072, 0.00801973, 0.02280642, 0.00104134}}};
 
@@ -386,7 +386,7 @@ template <typename TypeList> void testProbabilitiesObsShots() {
 
             std::size_t num_shots = 10000;
             auto prob_obs_shots = Measurer_obs_shots.probs(*obs, num_shots);
-            auto prob = Measurer.probs(std::vector<std::size_t>({2, 1, 0}));
+            auto prob = Measurer.probs(std::vector<std::size_t>({0, 1, 2}));
             auto prob_all = mpi_manager.allgather(prob);
             REQUIRE_THAT(prob_obs_shots, Catch::Approx(prob_all).margin(5e-2));
         }
diff --git a/pennylane_lightning/core/src/observables/tests/mpi/Test_ObservablesMPI.cpp b/pennylane_lightning/core/src/observables/tests/mpi/Test_ObservablesMPI.cpp
index eb39b57f5c..30f7262349 100644
--- a/pennylane_lightning/core/src/observables/tests/mpi/Test_ObservablesMPI.cpp
+++ b/pennylane_lightning/core/src/observables/tests/mpi/Test_ObservablesMPI.cpp
@@ -522,7 +522,6 @@ template <typename TypeList> void testHamiltonianBase() {
 
                 StateVectorT sv_mpi(mpi_manager, dt_local, mpi_buffersize,
                                     nGlobalIndexBits, nLocalIndexBits);
-                sv_mpi.initSV();
 
                 REQUIRE_THROWS_AS(ham->applyInPlace(sv_mpi),
                                   LightningException);
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaBase.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaBase.hpp
index 90b03961e7..772d1b6a2c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaBase.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaBase.hpp
@@ -198,16 +198,6 @@ class StateVectorCudaBase : public StateVectorBase<Precision, Derived> {
         data_buffer_ = std::move(other);
     }
 
-    /**
-     * @brief Initialize the statevector data to the |0...0> state
-     *
-     */
-    void initSV(bool async = false) {
-        std::size_t index = 0;
-        const std::complex<Precision> value(1, 0);
-        static_cast<Derived *>(this)->setBasisState(value, index, async);
-    };
-
   protected:
     using ParFunc = std::function<void(const std::vector<std::size_t> &, bool,
                                        const std::vector<Precision> &)>;
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 3753f792fd..964c5e69ce 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -119,6 +119,7 @@ class StateVectorCudaMPI final
               handle_.get(), mpi_manager_, mpi_buf_size, BaseType::getData(),
               num_local_qubits, localStream_.get())),
           gate_cache_(true, dev_tag) {
+        resetStateVector();
         PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
         mpi_manager_.Barrier();
     };
@@ -137,6 +138,7 @@ class StateVectorCudaMPI final
               handle_.get(), mpi_manager_, mpi_buf_size, BaseType::getData(),
               num_local_qubits, localStream_.get())),
           gate_cache_(true, dev_tag) {
+        resetStateVector();
         PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
         mpi_manager_.Barrier();
     };
@@ -155,6 +157,7 @@ class StateVectorCudaMPI final
               handle_.get(), mpi_manager_, mpi_buf_size, BaseType::getData(),
               num_local_qubits, localStream_.get())),
           gate_cache_(true, dev_tag) {
+        resetStateVector();
         PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
         mpi_manager_.Barrier();
     };
@@ -193,7 +196,7 @@ class StateVectorCudaMPI final
               handle_.get(), mpi_manager_, 0, BaseType::getData(),
               num_local_qubits, localStream_.get())),
           gate_cache_(true, dev_tag) {
-        BaseType::initSV();
+        resetStateVector();
         PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
         mpi_manager_.Barrier();
     }
@@ -251,92 +254,83 @@ class StateVectorCudaMPI final
     }
 
     /**
-     * @brief Set value for a single element of the state-vector on device. This
-     * method is implemented by cudaMemcpy.
-     *
-     * @param value Value to be set for the target element.
-     * @param index Index of the target element.
-     * @param async Use an asynchronous memory copy.
+     * @brief Reset the statevector data to the |0...0> state.
+     * @param use_async Use an asynchronous memory copy or not. Default is
+     * false.
      */
-    void setBasisState(const std::complex<Precision> &value,
-                       const std::size_t index, const bool async = false) {
-        std::size_t rankId = index >> BaseType::getNumQubits();
-
-        std::size_t local_index =
-            static_cast<std::size_t>(
-                rankId * std::pow(2.0, static_cast<long double>(
-                                           BaseType::getNumQubits()))) ^
-            index;
+    void resetStateVector(bool use_async = false) {
         BaseType::getDataBuffer().zeroInit();
+        std::size_t index = 0;
+        ComplexT value(1.0, 0.0);
+        setBasisState_(value, index, use_async);
+    };
 
-        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
-        auto stream_id = localStream_.get();
+    /**
+     * @brief Prepare a single computational basis state.
+     *
+     * @param state Binary digits of the basis state, one per entry of `wires`.
+     * @param wires Wires; each must be lower than the total qubit count.
+     * @param use_async Use an asynchronous memory copy.
+     */
+    void setBasisState(const std::vector<std::size_t> &state,
+                       const std::vector<std::size_t> &wires,
+                       const bool use_async) {
+        PL_ABORT_IF_NOT(state.size() == wires.size(),
+                        "state and wires must have equal dimensions.");
 
-        if (mpi_manager_.getRank() == rankId) {
-            setBasisState_CUDA(BaseType::getData(), value_cu, local_index,
-                               async, stream_id);
+        const auto num_qubits = this->getTotalNumQubits();
+        std::size_t index{0U};
+        for (std::size_t k = 0; k < wires.size(); k++) {
+            PL_ABORT_IF_NOT(wires[k] < num_qubits, "Invalid wire index.");
+            index |= state[k] << (num_qubits - 1 - wires[k]);
         }
-        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
-        mpi_manager_.Barrier();
+
+        const std::complex<PrecisionT> value(1.0, 0.0);
+        BaseType::getDataBuffer().zeroInit();
+        setBasisState_(value, index, use_async);
     }
 
     /**
-     * @brief Set values for a batch of elements of the state-vector. This
-     * method is implemented by the customized CUDA kernel defined in the
-     * DataBuffer class.
+     * @brief Set values for a batch of elements of the state-vector.
      *
-     * @param num_indices Number of elements to be passed to the state vector.
-     * @param values Pointer to values to be set for the target elements.
-     * @param indices Pointer to indices of the target elements.
-     * @param async Use an asynchronous memory copy.
+     * @param state_ptr Pointer to initial state data.
+     * @param num_states Length of initial state data.
+     * @param wires Wires.
+     * @param use_async Use an asynchronous memory copy. Default is false.
      */
-    template <class index_type, std::size_t thread_per_block = 256>
-    void setStateVector(const index_type num_indices,
-                        const std::complex<Precision> *values,
-                        const index_type *indices, const bool async = false) {
-        BaseType::getDataBuffer().zeroInit();
-
-        std::vector<index_type> indices_local;
-        std::vector<std::complex<Precision>> values_local;
-
-        for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
-             i++) {
-            int index = indices[i];
-            PL_ASSERT(index >= 0);
-            std::size_t rankId =
-                static_cast<std::size_t>(index) >> BaseType::getNumQubits();
-
-            if (rankId == mpi_manager_.getRank()) {
-                int local_index =
-                    static_cast<std::size_t>(
-                        rankId * std::pow(2.0, static_cast<long double>(
-                                                   BaseType::getNumQubits()))) ^
-                    index;
-                indices_local.push_back(local_index);
-                values_local.push_back(values[i]);
+    void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
+                        const std::vector<std::size_t> &wires,
+                        bool use_async = false) {
+        PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
+                        "Inconsistent state and wires dimensions.");
+
+        const auto num_qubits = this->getTotalNumQubits();
+
+        PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
+                                     [&num_qubits](const auto i) {
+                                         return i >= num_qubits;
+                                     }) == wires.end(),
+                        "Invalid wire index.");
+
+        using index_type =
+            typename std::conditional<std::is_same<PrecisionT, float>::value,
+                                      int32_t, int64_t>::type;
+
+        // Calculate the indices of the state-vector to be set.
+        // TODO: Could move to GPU/MPI calculation if the state size is large.
+        std::vector<index_type> indices(num_states);
+        const std::size_t num_wires = wires.size();
+        constexpr std::size_t one{1U};
+        for (std::size_t i = 0; i < num_states; i++) {
+            std::size_t index{0U};
+            for (std::size_t j = 0; j < num_wires; j++) {
+                const std::size_t bit = (i & (one << j)) >> j;
+                index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
             }
+            indices[i] = static_cast<index_type>(index);
         }
-
-        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-
-        index_type num_elements = indices_local.size();
-
-        DataBuffer<index_type, int> d_indices{
-            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
-
-        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
-                                        device_id, stream_id, true};
-
-        d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
-                                    async);
-        d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
-                                   async);
-
-        setStateVector_CUDA(BaseType::getData(), num_elements,
-                            d_values.getData(), d_indices.getData(),
-                            thread_per_block, stream_id);
-        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        setStateVector_<index_type>(num_states, state_ptr, indices.data(),
+                                    use_async);
         mpi_manager_.Barrier();
     }
 
@@ -405,6 +399,19 @@ class StateVectorCudaMPI final
                     cuGates::getRot<CFP_t>(params[0], params[1], params[2]);
                 applyDeviceMatrixGate(rot_matrix.data(), ctrls, tgts, false);
             }
+        } else if (opName == "Matrix") {
+            DataBuffer<CFP_t, int> d_matrix{
+                gate_matrix.size(), BaseType::getDataBuffer().getDevTag(),
+                true};
+            d_matrix.CopyHostDataToGpu(gate_matrix.data(), d_matrix.getLength(),
+                                       false);
+            // ensure wire indexing correctly preserved for tensor-observables
+            const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
+                                                       ctrls.rend()};
+            const std::vector<std::size_t> tgts_local{tgts.rbegin(),
+                                                      tgts.rend()};
+            applyDeviceMatrixGate(d_matrix.getData(), ctrls_local, tgts_local,
+                                  adjoint);
         } else if (par_gates_.find(opName) != par_gates_.end()) {
             par_gates_.at(opName)(wires, adjoint, params);
         } else { // No offloadable function call; defer to matrix passing
@@ -484,7 +491,7 @@ class StateVectorCudaMPI final
                      const std::vector<std::size_t> &wires,
                      bool adjoint = false) {
         PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0");
-        const std::string opName = {};
+        const std::string opName = "Matrix";
         std::size_t n = std::size_t{1} << wires.size();
         const std::vector<std::complex<PrecisionT>> matrix(gate_matrix,
                                                            gate_matrix + n * n);
@@ -1528,6 +1535,88 @@ class StateVectorCudaMPI final
         return t_indices;
     }
 
+    /**
+     * @brief Set values for a batch of elements of the state-vector. The
+     * buffer is zeroed first, then only the elements whose global index
+     * maps to this MPI rank are written through a custom CUDA kernel.
+     *
+     * @param num_indices Number of elements to be passed to the state vector.
+     * @param values Pointer to values to be set for the target elements.
+     * @param indices Pointer to global indices of the target elements.
+     * @param async Use an asynchronous memory copy.
+     */
+    template <class index_type, std::size_t thread_per_block = 256>
+    void setStateVector_(const index_type num_indices,
+                         const std::complex<Precision> *values,
+                         const index_type *indices, const bool async = false) {
+        BaseType::getDataBuffer().zeroInit();
+
+        std::vector<index_type> indices_local;
+        std::vector<std::complex<Precision>> values_local;
+        // Keep only the entries whose global index maps to this rank.
+        for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
+             i++) {
+            const index_type index = indices[i];
+            PL_ASSERT(index >= 0);
+            std::size_t rankId =
+                static_cast<std::size_t>(index) >> BaseType::getNumQubits();
+
+            if (rankId == mpi_manager_.getRank()) {
+                const auto local_index = static_cast<index_type>(
+                    compute_local_index(static_cast<std::size_t>(index),
+                                        this->getNumLocalQubits()));
+                indices_local.push_back(local_index);
+                values_local.push_back(values[i]);
+            }
+        }
+
+        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+
+        index_type num_elements = indices_local.size();
+
+        DataBuffer<index_type, int> d_indices{
+            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
+
+        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
+                                        device_id, stream_id, true};
+
+        d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
+                                    async);
+        d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
+                                   async);
+
+        setStateVector_CUDA(BaseType::getData(), num_elements,
+                            d_values.getData(), d_indices.getData(),
+                            thread_per_block, stream_id);
+    }
+
+    /**
+     * @brief Set value for a single element of the state-vector on device. This
+     * method is implemented by cudaMemcpy.
+     *
+     * @param value Value to be set for the target element.
+     * @param index Index of the target element.
+     * @param async Use an asynchronous memory copy.
+     */
+    void setBasisState_(const std::complex<Precision> &value,
+                        const std::size_t index, const bool async = false) {
+        const std::size_t rankId = index >> this->getNumLocalQubits();
+
+        const std::size_t local_index =
+            compute_local_index(index, this->getNumLocalQubits());
+
+        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
+        auto stream_id = localStream_.get();
+
+        if (mpi_manager_.getRank() == rankId) {
+            setBasisState_CUDA(BaseType::getData(), value_cu, local_index,
+                               async, stream_id);
+        }
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
+    }
+
     /**
      * @brief Get expectation value for a sum of Pauli words.
      *
@@ -1591,8 +1680,8 @@ class StateVectorCudaMPI final
     }
 
     /**
-     * @brief Apply parametric Pauli gates to local statevector using custateVec
-     * calls.
+     * @brief Apply parametric Pauli gates to local statevector using
+     * custateVec calls.
      *
      * @param pauli_words List of Pauli words representing operation.
      * @param ctrls Control wires
@@ -1662,7 +1751,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
@@ -1822,7 +1912,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
@@ -1963,7 +2054,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 716d95c89f..f5aeb4abb6 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -111,7 +111,10 @@ class StateVectorCudaManaged
         : StateVectorCudaBase<Precision, StateVectorCudaManaged<Precision>>(
               num_qubits),
           handle_(make_shared_cusv_handle()),
-          cublascaller_(make_shared_cublas_caller()), gate_cache_(true){};
+          cublascaller_(make_shared_cublas_caller()), gate_cache_(true) {
+        resetStateVector();
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+    };
 
     StateVectorCudaManaged(
         std::size_t num_qubits, const DevTag<int> &dev_tag, bool alloc = true,
@@ -124,7 +127,8 @@ class StateVectorCudaManaged
           cublascaller_(std::move(cublascaller_in)),
           cusparsehandle_(std::move(cusparsehandle_in)),
           gate_cache_(true, dev_tag) {
-        BaseType::initSV();
+        resetStateVector();
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
     };
 
     StateVectorCudaManaged(const CFP_t *gpu_data, std::size_t length)
@@ -167,54 +171,90 @@ class StateVectorCudaManaged
     ~StateVectorCudaManaged() = default;
 
     /**
-     * @brief Set value for a single element of the state-vector on device. This
-     * method is implemented by cudaMemcpy.
-     *
-     * @param value Value to be set for the target element.
-     * @param index Index of the target element.
-     * @param async Use an asynchronous memory copy.
+     * @brief Reset the statevector data to the |0...0> state.
+     * @param use_async Use an asynchronous memory copy or not. Default is
+     * false.
      */
-    void setBasisState(const std::complex<Precision> &value,
-                       const std::size_t index, const bool async = false) {
+    void resetStateVector(bool use_async = false) {
         BaseType::getDataBuffer().zeroInit();
-
-        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-        setBasisState_CUDA(BaseType::getData(), value_cu, index, async,
-                           stream_id);
-    }
+        std::size_t index = 0;
+        ComplexT value(1.0, 0.0);
+        setBasisState_(value, index, use_async);
+    };
 
     /**
-     * @brief Set values for a batch of elements of the state-vector. This
-     * method is implemented by the customized CUDA kernel defined in the
-     * DataBuffer class.
+     * @brief Prepare a single computational basis state.
      *
-     * @param num_indices Number of elements to be passed to the state vector.
-     * @param values Pointer to values to be set for the target elements.
-     * @param indices Pointer to indices of the target elements.
-     * @param async Use an asynchronous memory copy.
+     * @param state Binary number representing the index
+     * @param wires Wires.
+     * @param use_async Use an asynchronous memory copy. Default is
+     * false.
      */
-    template <class index_type, std::size_t thread_per_block = 256>
-    void setStateVector(const index_type num_indices,
-                        const std::complex<Precision> *values,
-                        const index_type *indices, const bool async = false) {
-        BaseType::getDataBuffer().zeroInit();
+    void setBasisState(const std::vector<std::size_t> &state,
+                       const std::vector<std::size_t> &wires,
+                       const bool use_async = false) {
+        PL_ABORT_IF_NOT(state.size() == wires.size(),
+                        "state and wires must have equal dimensions.");
+        const auto num_qubits = BaseType::getNumQubits();
+        PL_ABORT_IF_NOT(
+            std::find_if(wires.begin(), wires.end(),
+                         [&num_qubits](const auto i) {
+                             return i >= num_qubits;
+                         }) == wires.end(),
+            "wires must take values lower than the number of qubits.");
+        const auto n_wires = wires.size();
+        std::size_t index{0U};
+        for (std::size_t k = 0; k < n_wires; k++) {
+            index |= state[k] << (num_qubits - 1 - wires[k]);
+        }
 
-        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+        const std::complex<PrecisionT> value(1.0, 0.0);
 
-        index_type num_elements = num_indices;
-        DataBuffer<index_type, int> d_indices{
-            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
-        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
-                                        device_id, stream_id, true};
-
-        d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
-        d_values.CopyHostDataToGpu(values, d_values.getLength(), async);
+        BaseType::getDataBuffer().zeroInit();
+        setBasisState_(value, index, use_async);
+    }
 
-        setStateVector_CUDA(BaseType::getData(), num_elements,
-                            d_values.getData(), d_indices.getData(),
-                            thread_per_block, stream_id);
+    /**
+     * @brief Set values for a batch of elements of the state-vector.
+     *
+     * @param state_ptr Pointer to the initial state data.
+     * @param num_states Length of the initial state data.
+     * @param wires Wires.
+     * @param use_async Use an asynchronous memory copy. Default is false.
+     */
+    void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
+                        const std::vector<std::size_t> &wires,
+                        bool use_async = false) {
+        PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
+                        "Inconsistent state and wires dimensions.");
+
+        const auto num_qubits = BaseType::getNumQubits();
+
+        PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
+                                     [&num_qubits](const auto i) {
+                                         return i >= num_qubits;
+                                     }) == wires.end(),
+                        "Invalid wire index.");
+
+        using index_type =
+            typename std::conditional<std::is_same<PrecisionT, float>::value,
+                                      int32_t, int64_t>::type;
+
+        // Calculate the indices of the state-vector to be set.
+        // TODO: Could move to GPU calculation if the state size is large.
+        std::vector<index_type> indices(num_states);
+        const std::size_t num_wires = wires.size();
+        constexpr std::size_t one{1U};
+        for (std::size_t i = 0; i < num_states; i++) {
+            std::size_t index{0U};
+            for (std::size_t j = 0; j < num_wires; j++) {
+                const std::size_t bit = (i & (one << j)) >> j;
+                index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
+            }
+            indices[i] = static_cast<index_type>(index);
+        }
+        setStateVector_<index_type>(num_states, state_ptr, indices.data(),
+                                    use_async);
     }
 
     /**
@@ -324,6 +364,19 @@ class StateVectorCudaManaged
                     cuGates::getRot<CFP_t>(params[0], params[1], params[2]);
                 applyDeviceMatrixGate(rot_matrix.data(), ctrls, tgts, false);
             }
+        } else if (opName == "Matrix") {
+            DataBuffer<CFP_t, int> d_matrix{
+                gate_matrix.size(), BaseType::getDataBuffer().getDevTag(),
+                true};
+            d_matrix.CopyHostDataToGpu(gate_matrix.data(), d_matrix.getLength(),
+                                       false);
+            // ensure wire indexing correctly preserved for tensor-observables
+            const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
+                                                       ctrls.rend()};
+            const std::vector<std::size_t> tgts_local{tgts.rbegin(),
+                                                      tgts.rend()};
+            applyDeviceMatrixGate(d_matrix.getData(), ctrls_local, tgts_local,
+                                  adjoint);
         } else if (par_gates_.find(opName) != par_gates_.end()) {
             par_gates_.at(opName)(wires, adjoint, params);
         } else { // No offloadable function call; defer to matrix passing
@@ -403,7 +456,7 @@ class StateVectorCudaManaged
                      const std::vector<std::size_t> &wires,
                      bool adjoint = false) {
         PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0");
-        const std::string opName = {};
+        const std::string opName = "Matrix";
         std::size_t n = std::size_t{1} << wires.size();
         const std::vector<std::complex<PrecisionT>> matrix(gate_matrix,
                                                            gate_matrix + n * n);
@@ -434,6 +487,56 @@ class StateVectorCudaManaged
         applyMatrix(gate_matrix.data(), wires, adjoint);
     }
 
+    /**
+     * @brief Collapse the state vector after measuring one of the qubits.
+     *
+     * Note: The outcome is not sampled here; the branch parameter imposes the
+     * measurement result on the given wire.
+     *
+     * @param wire Wire to measure; must be below the number of qubits.
+     * @param branch Imposed outcome: false selects 0, true selects 1.
+     */
+    void collapse(std::size_t wire, bool branch) {
+        PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index.");
+        cudaDataType_t data_type;
+        // Pick the CUDA data type tag that matches the precision of CFP_t.
+        if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                      std::is_same_v<CFP_t, double2>) {
+            data_type = CUDA_C_64F;
+        } else {
+            data_type = CUDA_C_32F;
+        }
+        // cuStateVec index bit: `wire` counted from the opposite end.
+        std::vector<int> basisBits(1, BaseType::getNumQubits() - 1 - wire);
+        // Filled in by custatevecAbs2SumOnZBasis below.
+        double abs2sum0;
+        double abs2sum1;
+
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* double * */ &abs2sum0,
+            /* double * */ &abs2sum1,
+            /* const int32_t * */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size()));
+        // Norm for the imposed branch. NOTE(review): not checked for zero.
+        const double norm = branch ? abs2sum1 : abs2sum0;
+
+        const int parity = static_cast<int>(branch);
+
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* const int32_t parity */ parity,
+            /* const int32_t *basisBits */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size(),
+            /* double norm */ norm));
+    }
+
     //****************************************************************************//
     // Explicit gate calls for bindings
     //****************************************************************************//
@@ -1303,6 +1406,55 @@ class StateVectorCudaManaged
         return t_indices;
     }
 
+    /** @brief Set value for a single element of the state-vector on device.
+     * Forwards to setBasisState_CUDA (presumably a cudaMemcpy; confirm there).
+     *
+     * @param value Value to be set; converted to CFP_t before the call.
+     * @param index Index of the target element.
+     * @param async Use an asynchronous memory copy (forwarded to the helper).
+     */
+    void setBasisState_(const std::complex<Precision> &value,
+                        const std::size_t index, const bool async = false) {
+        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+        setBasisState_CUDA(BaseType::getData(), value_cu, index, async,
+                           stream_id);
+    }
+
+    /**
+     * @brief Set values for a batch of elements of the state-vector. The
+     * state is zero-initialized first; values and indices are then copied to
+     * temporary device buffers and passed to setStateVector_CUDA.
+     * @tparam thread_per_block Forwarded to setStateVector_CUDA (default 256).
+     * @param num_indices Number of elements to be passed to the state vector.
+     * @param values Pointer to values to be set for the target elements.
+     * @param indices Pointer to indices of the target elements.
+     * @param async Use an asynchronous memory copy.
+     */
+    template <class index_type, std::size_t thread_per_block = 256>
+    void setStateVector_(const index_type num_indices,
+                         const std::complex<Precision> *values,
+                         const index_type *indices, const bool async = false) {
+        BaseType::getDataBuffer().zeroInit();
+
+        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+
+        index_type num_elements = num_indices;
+        DataBuffer<index_type, int> d_indices{
+            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
+        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
+                                        device_id, stream_id, true};
+
+        d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
+        d_values.CopyHostDataToGpu(values, d_values.getLength(), async);
+        // The device-wide synchronization below runs regardless of `async`.
+        setStateVector_CUDA(BaseType::getData(), num_elements,
+                            d_values.getData(), d_indices.getData(),
+                            thread_per_block, stream_id);
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+    }
+
     /**
      * @brief Apply parametric Pauli gates using custateVec calls.
      *
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/tests/Test_AdjointJacobianGPU.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/tests/Test_AdjointJacobianGPU.cpp
index 30109d64e2..ccdcedaea2 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/tests/Test_AdjointJacobianGPU.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/tests/Test_AdjointJacobianGPU.cpp
@@ -50,7 +50,6 @@ TEST_CASE("AdjointJacobianGPU::AdjointJacobianGPU Op=RX, Obs=Z",
             auto ops = OpsData<StateVectorT>({"RX"}, {{p}}, {{0}}, {false});
 
             StateVectorT psi(num_qubits);
-            psi.initSV();
 
             JacobianData<StateVectorT> tape{
                 param.size(), psi.getLength(), psi.getData(), {obs}, ops, tp};
@@ -80,7 +79,6 @@ TEST_CASE("AdjointJacobianGPU::adjointJacobian Op=RY, Obs=X",
             auto ops = OpsData<StateVectorT>({"RY"}, {{p}}, {{0}}, {false});
 
             StateVectorT psi(num_qubits);
-            psi.initSV();
 
             JacobianData<StateVectorT> tape{
                 param.size(), psi.getLength(), psi.getData(), {obs}, ops, tp};
@@ -112,7 +110,6 @@ TEST_CASE("AdjointJacobianGPU::adjointJacobian Op=[QubitStateVector, "
             {0.0, 0.0}, {1.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObs<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -150,7 +147,6 @@ TEST_CASE("AdjointJacobianGPU::adjointJacobian Op=RX, Obs=[Z,Z]",
         std::vector<double> jacobian(num_obs * tp.size(), 0);
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObs<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -183,7 +179,6 @@ TEST_CASE("AdjointJacobianGPU::AdjointJacobianGPU Op=[RX,RX,RX], Obs=[Z,Z,Z]",
         std::vector<double> jacobian(num_obs * tp.size(), 0);
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObs<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -225,7 +220,6 @@ TEST_CASE("AdjointJacobianGPU::AdjointJacobianGPU Op=[RX,RX,RX], Obs=[Z,Z,Z],"
         std::vector<double> jacobian(num_obs * tp.size(), 0);
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObs<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -265,7 +259,6 @@ TEST_CASE("Algorithms::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]",
         std::vector<double> jacobian(num_obs * tp.size(), 0);
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         const auto obs = std::make_shared<TensorProdObs<StateVectorT>>(
             std::make_shared<NamedObs<StateVectorT>>(
@@ -304,7 +297,6 @@ TEST_CASE("AdjointJacobianGPU::adjointJacobian Op=Mixed, Obs=[XXX]",
         std::vector<double> jacobian(num_obs * tp.size(), 0);
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         const auto obs = std::make_shared<TensorProdObs<StateVectorT>>(
             std::make_shared<NamedObs<StateVectorT>>(
@@ -553,7 +545,6 @@ TEST_CASE("Algorithms::adjointJacobian Op=RX, Obs=Ham[Z0+Z1]", "[Algorithms]") {
         std::vector<double> jacobian(num_obs * tp.size(), 0);
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObs<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -588,7 +579,6 @@ TEST_CASE(
         std::vector<double> jacobian(num_obs * tp.size(), 0);
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         auto obs1 = std::make_shared<NamedObs<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -630,7 +620,6 @@ TEST_CASE("AdjointJacobianGPU::AdjointJacobianGPU Test HermitianObs",
         std::vector<double> jacobian2(num_obs * tp.size(), 0);
 
         StateVectorT psi(num_qubits);
-        psi.initSV();
 
         auto obs1 = std::make_shared<TensorProdObs<StateVectorT>>(
             std::make_shared<NamedObs<StateVectorT>>(
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/tests/mpi/Test_AdjointJacobianGPUMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/tests/mpi/Test_AdjointJacobianGPUMPI.cpp
index 8cf5a487ee..3d0e6cab7c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/tests/mpi/Test_AdjointJacobianGPUMPI.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/tests/mpi/Test_AdjointJacobianGPUMPI.cpp
@@ -84,7 +84,6 @@ TEST_CASE("AdjointJacobianGPUMPI::adjointJacobianMPI Op=RX, Obs=[Z,Z]",
     {
         StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                          nGlobalIndexBits, nLocalIndexBits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObsMPI<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -144,7 +143,6 @@ TEST_CASE("AdjointJacobianGPUMPI::adjointJacobianMPI Op=[QubitStateVector, "
     {
         StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                          nGlobalIndexBits, nLocalIndexBits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObsMPI<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -200,7 +198,6 @@ TEST_CASE(
     {
         StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                          nGlobalIndexBits, nLocalIndexBits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObsMPI<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -269,7 +266,6 @@ TEST_CASE(
     {
         StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                          nGlobalIndexBits, nLocalIndexBits);
-        psi.initSV();
 
         const auto obs1 = std::make_shared<NamedObsMPI<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
@@ -334,7 +330,6 @@ TEST_CASE("AdjointJacobianGPUMPI::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]",
     {
         StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                          nGlobalIndexBits, nLocalIndexBits);
-        psi.initSV();
 
         const auto obs = std::make_shared<TensorProdObsMPI<StateVectorT>>(
             std::make_shared<NamedObsMPI<StateVectorT>>(
@@ -397,7 +392,6 @@ TEST_CASE("AdjointJacobianGPUMPI::adjointJacobian Op=Mixed, Obs=[XXX]",
     {
         StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                          nGlobalIndexBits, nLocalIndexBits);
-        psi.initSV();
 
         const auto obs = std::make_shared<TensorProdObsMPI<StateVectorT>>(
             std::make_shared<NamedObsMPI<StateVectorT>>(
@@ -478,8 +472,6 @@ TEST_CASE("AdjointJacobianGPU::AdjointJacobianGPUMPI Op=[RX,RX,RX], "
     {
         StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                          nGlobalIndexBits, nLocalIndexBits);
-        psi.initSV();
-
         auto obs1 = std::make_shared<NamedObsMPI<StateVectorT>>(
             "PauliZ", std::vector<std::size_t>{0});
         auto obs2 = std::make_shared<NamedObsMPI<StateVectorT>>(
@@ -548,7 +540,6 @@ TEST_CASE("AdjointJacobianGPU::AdjointJacobianGPU Test HermitianObs",
     {
         StateVectorT psi(mpi_manager, dt_local, mpi_buffersize,
                          nGlobalIndexBits, nLocalIndexBits);
-        psi.initSV();
 
         auto obs1 = std::make_shared<TensorProdObsMPI<StateVectorT>>(
             std::make_shared<NamedObsMPI<StateVectorT>>(
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index 5bd92b5520..145097b30e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -63,10 +63,6 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
     using ParamT = PrecisionT;        // Parameter's data precision
     using np_arr_c = py::array_t<std::complex<ParamT>,
                                  py::array::c_style | py::array::forcecast>;
-    using np_arr_sparse_ind = typename std::conditional<
-        std::is_same<ParamT, float>::value,
-        py::array_t<int32_t, py::array::c_style | py::array::forcecast>,
-        py::array_t<int64_t, py::array::c_style | py::array::forcecast>>::type;
 
     registerGatesForStateVector<StateVectorT>(pyclass);
 
@@ -83,28 +79,23 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
         }))
         .def(
             "setBasisState",
-            [](StateVectorT &sv, const std::size_t index,
-               const bool use_async) {
-                const std::complex<PrecisionT> value(1, 0);
-                sv.setBasisState(value, index, use_async);
-            },
-            "Create Basis State on GPU.")
+            [](StateVectorT &sv, const std::vector<std::size_t> &state,
+               const std::vector<std::size_t> &wires,
+               const bool async) { sv.setBasisState(state, wires, async); },
+            py::arg("state") = nullptr, py::arg("wires") = nullptr,
+            py::arg("async") = false,
+            "Set the state vector to a basis state on GPU.")
         .def(
             "setStateVector",
-            [](StateVectorT &sv, const np_arr_sparse_ind &indices,
-               const np_arr_c &state, const bool use_async) {
-                using index_type = typename std::conditional<
-                    std::is_same<ParamT, float>::value, int32_t, int64_t>::type;
-
-                sv.template setStateVector<index_type>(
-                    static_cast<index_type>(indices.request().size),
-                    static_cast<std::complex<PrecisionT> *>(
-                        state.request().ptr),
-                    static_cast<index_type *>(indices.request().ptr),
-                    use_async);
+            [](StateVectorT &sv, const np_arr_c &state,
+               const std::vector<std::size_t> &wires, const bool async) {
+                const auto state_buffer = state.request();
+                const auto state_ptr =
+                    static_cast<const std::complex<ParamT> *>(state_buffer.ptr);
+                sv.setStateVector(state_ptr, state_buffer.size, wires, async);
             },
-            "Set State Vector on GPU with values and their corresponding "
-            "indices for the state vector on device")
+            "Set State Vector on GPU with values for the state vector and "
+            "wires on the host memory.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
@@ -152,7 +143,15 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
              "Get the GPU index for the statevector data.")
         .def("numQubits", &StateVectorT::getNumQubits)
         .def("dataLength", &StateVectorT::getLength)
-        .def("resetGPU", &StateVectorT::initSV)
+        .def(
+            "resetStateVector",
+            [](StateVectorT &gpu_sv, bool async) {
+                gpu_sv.resetStateVector(async);
+            },
+            py::arg("async") = false,
+            "Initialize the statevector data to the |0...0> state")
+        .def("collapse", &StateVectorT::collapse,
+             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 620fd93868..2d3313f694 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -63,10 +63,6 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
     using ParamT = PrecisionT;        // Parameter's data precision
     using np_arr_c = py::array_t<std::complex<ParamT>,
                                  py::array::c_style | py::array::forcecast>;
-    using np_arr_sparse_ind = typename std::conditional<
-        std::is_same<ParamT, float>::value,
-        py::array_t<int32_t, py::array::c_style | py::array::forcecast>,
-        py::array_t<int64_t, py::array::c_style | py::array::forcecast>>::type;
 
     registerGatesForStateVector<StateVectorT>(pyclass);
 
@@ -86,28 +82,24 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             })) // qubits, device
         .def(
             "setBasisState",
-            [](StateVectorT &sv, const std::size_t index,
-               const bool use_async) {
-                const std::complex<PrecisionT> value(1, 0);
-                sv.setBasisState(value, index, use_async);
+            [](StateVectorT &sv, const std::vector<std::size_t> &state,
+               const std::vector<std::size_t> &wires, const bool use_async) {
+                sv.setBasisState(state, wires, use_async);
             },
-            "Create Basis State on GPU.")
+            py::arg("state") = nullptr, py::arg("wires") = nullptr,
+            py::arg("async") = false,
+            "Set the state vector to a basis state on GPU.")
         .def(
             "setStateVector",
-            [](StateVectorT &sv, const np_arr_sparse_ind &indices,
-               const np_arr_c &state, const bool use_async) {
-                using index_type = typename std::conditional<
-                    std::is_same<ParamT, float>::value, int32_t, int64_t>::type;
-
-                sv.template setStateVector<index_type>(
-                    static_cast<index_type>(indices.request().size),
-                    static_cast<std::complex<PrecisionT> *>(
-                        state.request().ptr),
-                    static_cast<index_type *>(indices.request().ptr),
-                    use_async);
+            [](StateVectorT &sv, const np_arr_c &state,
+               const std::vector<std::size_t> &wires, const bool async) {
+                const auto state_buffer = state.request();
+                const auto state_ptr =
+                    static_cast<const std::complex<ParamT> *>(state_buffer.ptr);
+                sv.setStateVector(state_ptr, state_buffer.size, wires, async);
             },
-            "Set State Vector on GPU with values and their corresponding "
-            "indices for the state vector on device")
+            "Set State Vector on GPU with values for the state vector and "
+            "wires on the host memory.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
@@ -155,7 +147,13 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
              "Get the GPU index for the statevector data.")
         .def("numQubits", &StateVectorT::getNumQubits)
         .def("dataLength", &StateVectorT::getLength)
-        .def("resetGPU", &StateVectorT::initSV)
+        .def(
+            "resetStateVector",
+            [](StateVectorT &gpu_sv, bool use_async) {
+                gpu_sv.resetStateVector(use_async);
+            },
+            py::arg("async") = false,
+            "Initialize the statevector data to the |0...0> state")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
index b552ef5f01..af864d8b01 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
@@ -16,6 +16,7 @@
 #include <complex>
 #include <iostream>
 #include <limits>
+#include <numeric>
 #include <type_traits>
 #include <utility>
 #include <vector>
@@ -74,7 +75,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::StateVectorCudaManaged",
                                            {0, 0}, {0, 0}, {0, 0}, {0, 0}};
         SECTION("GPU <-> host data: std::complex") {
             StateVectorCudaManaged<TestType> sv{num_qubits};
-            sv.initSV();
             std::vector<cp_t> out_data(Pennylane::Util::exp2(num_qubits),
                                        {0.5, 0.5});
             std::vector<cp_t> ref_data(Pennylane::Util::exp2(num_qubits),
@@ -100,7 +100,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyHadamard",
         SECTION("Apply directly") {
             for (std::size_t index = 0; index < num_qubits; index++) {
                 StateVectorCudaManaged<TestType> sv{num_qubits};
-                sv.initSV();
                 CHECK(sv.getDataVector()[0] == cp_t{1, 0});
                 sv.applyHadamard({index}, inverse);
                 CAPTURE(sv.getDataVector());
@@ -120,7 +119,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyHadamard",
         SECTION("Apply using dispatcher") {
             for (std::size_t index = 0; index < num_qubits; index++) {
                 StateVectorCudaManaged<TestType> sv{num_qubits};
-                sv.initSV();
 
                 CHECK(sv.getDataVector()[0] == cp_t{1, 0});
                 sv.applyOperation("Hadamard", {index}, inverse);
@@ -148,7 +146,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyPauliX",
         SECTION("Apply directly") {
             for (std::size_t index = 0; index < num_qubits; index++) {
                 StateVectorCudaManaged<TestType> sv{num_qubits};
-                sv.initSV();
                 CHECK(sv.getDataVector()[0] ==
                       cuUtil::ONE<std::complex<TestType>>());
                 sv.applyPauliX({index}, inverse);
@@ -161,7 +158,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyPauliX",
         SECTION("Apply using dispatcher") {
             for (std::size_t index = 0; index < num_qubits; index++) {
                 StateVectorCudaManaged<TestType> sv{num_qubits};
-                sv.initSV();
                 CHECK(sv.getDataVector()[0] ==
                       cuUtil::ONE<std::complex<TestType>>());
                 sv.applyOperation("PauliX", {index}, inverse);
@@ -181,7 +177,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyPauliY",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
         // Test using |+++> state
         sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
                            {{0}, {1}, {2}}, {{false}, {false}, {false}});
@@ -229,7 +224,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyPauliZ",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
         // Test using |+++> state
         sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
                            {{0}, {1}, {2}}, {{false}, {false}, {false}});
@@ -274,7 +268,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyS",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
         // Test using |+++> state
         sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
                            {{0}, {1}, {2}}, {{false}, {false}, {false}});
@@ -323,7 +316,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyT",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
         // Test using |+++> state
         sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
                            {{0}, {1}, {2}}, {{false}, {false}, {false}});
@@ -372,7 +364,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyCNOT",
     {
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
 
         // Test using |+00> state to generate 3-qubit GHZ state
         sv.applyOperation("Hadamard", {0});
@@ -414,7 +405,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applySWAP",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
 
         // Test using |+10> state
         sv.applyOperations({{"Hadamard"}, {"PauliX"}}, {{0}, {1}},
@@ -593,7 +583,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyCY",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
 
         // Test using |+10> state
         sv.applyOperations({{"Hadamard"}, {"PauliX"}}, {{0}, {1}},
@@ -762,7 +751,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyCZ",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
 
         // Test using |+10> state
         sv.applyOperations({{"Hadamard"}, {"PauliX"}}, {{0}, {1}},
@@ -876,7 +864,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyToffoli",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
 
         // Test using |+10> state
         sv.applyOperations({{"Hadamard"}, {"PauliX"}}, {{0}, {1}},
@@ -983,7 +970,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyCSWAP",
         using cp_t = std::complex<TestType>;
         const std::size_t num_qubits = 3;
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.initSV();
 
         // Test using |+10> state
         sv.applyOperations({{"Hadamard"}, {"PauliX"}}, {{0}, {1}},
@@ -1083,68 +1069,15 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVector",
         }
 
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.CopyHostDataToGpu(init_state.data(), init_state.size());
-
-        using index_type =
-            typename std::conditional<std::is_same<PrecisionT, float>::value,
-                                      int32_t, int64_t>::type;
-        // The setStates will shuffle the state vector values on the device with
-        // the following indices and values setting on host. For example, the
-        // values[i] is used to set the indices[i] th element of state vector on
-        // the device. For example, values[2] (init_state[5]) will be copied to
-        // indices[2]th or (4th) element of the state vector.
-        std::vector<index_type> indices = {0, 2, 4, 6, 1, 3, 5, 7};
-
-        std::vector<std::complex<PrecisionT>> values = {
-            init_state[1], init_state[3], init_state[5], init_state[7],
-            init_state[0], init_state[2], init_state[4], init_state[6]};
-
-        sv.template setStateVector<index_type>(values.size(), values.data(),
-                                               indices.data(), false);
-
-        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
-    }
-}
-// LCOV_EXCL_START
-TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVectorwith_thread_setting",
-                   "[StateVectorCudaManaged_Nonparam]", float, double) {
-    using PrecisionT = TestType;
-    const std::size_t num_qubits = 3;
-    std::mt19937 re{1337};
-
-    SECTION("SetStates with a non-default GPU thread setting") {
-        auto init_state =
-            createRandomStateVectorData<PrecisionT>(re, num_qubits);
-        auto expected_state = init_state;
 
-        for (std::size_t i = 0; i < Pennylane::Util::exp2(num_qubits - 1);
-             i++) {
-            std::swap(expected_state[i * 2], expected_state[i * 2 + 1]);
-        }
-
-        StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.CopyHostDataToGpu(init_state.data(), init_state.size());
-
-        using index_type =
-            typename std::conditional<std::is_same<PrecisionT, float>::value,
-                                      int32_t, int64_t>::type;
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
 
-        std::vector<index_type> indices = {0, 2, 4, 6, 1, 3, 5, 7};
-
-        std::vector<std::complex<PrecisionT>> values = {
-            init_state[1], init_state[3], init_state[5], init_state[7],
-            init_state[0], init_state[2], init_state[4], init_state[6]};
-
-        // default setting of the number of threads in a block is 256.
-        const std::size_t threads_per_block = 1024;
-
-        sv.template setStateVector<index_type, threads_per_block>(
-            values.size(), values.data(), indices.data(), false);
-
-        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{0, 1, 2});
+        CHECK(init_state == Pennylane::Util::approx(sv.getDataVector()));
     }
 }
-// LCOV_EXCL_STOP
 
 TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetIthStates",
                    "[StateVectorCudaManaged_Nonparam]", float, double) {
@@ -1156,21 +1089,19 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetIthStates",
         "Set Ith element of the state state on device with data on the host") {
         auto init_state =
             createRandomStateVectorData<PrecisionT>(re, num_qubits);
-        auto expected_state = init_state;
+        std::vector<std::complex<PrecisionT>> expected_state(init_state.size(),
+                                                             {0, 0});
 
-        expected_state[0] = expected_state[1];
-
-        for (std::size_t i = 1; i < Pennylane::Util::exp2(num_qubits); i++) {
-            expected_state[i] = {0, 0};
-        }
+        expected_state[expected_state.size() - 1] = {1.0, 0};
 
         StateVectorCudaManaged<TestType> sv{num_qubits};
         sv.CopyHostDataToGpu(init_state.data(), init_state.size());
 
-        std::size_t index = 0;
-        std::complex<PrecisionT> values = init_state[1];
+        std::vector<std::size_t> state(num_qubits, 1);
+        std::vector<std::size_t> wires(num_qubits, 0);
+        std::iota(wires.begin(), wires.end(), 0);
 
-        sv.setBasisState(values, index, false);
+        sv.setBasisState(state, wires, false);
 
         CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
     }
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Param.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Param.cpp
index c93eba882e..e2485910d9 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Param.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Param.cpp
@@ -43,7 +43,6 @@ TEMPLATE_TEST_CASE("LightningGPU:applyOperation", "[LightningGPU_Param]",
                    double) {
     const std::size_t num_qubits = 1;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     SECTION("Catch failures caused by unsupported named gates") {
         std::string obs = "paulix";
@@ -56,7 +55,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyRX", "[LightningGPU_Param]", double) {
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 1;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{{0.1}, {0.6}};
 
@@ -188,7 +186,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyRZ", "[LightningGPU_Param]", float,
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     // Test using |+++> state
     sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
@@ -250,7 +247,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyPhaseShift", "[LightningGPU_Param]",
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     // Test using |+++> state
     sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
@@ -313,7 +309,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyControlledPhaseShift",
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     // Test using |+++> state
     sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
@@ -387,7 +382,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyRot", "[LightningGPU_Param]", float,
     SECTION("Apply directly") {
         for (std::size_t index = 0; index < num_qubits; index++) {
             StateVectorCudaManaged<TestType> sv_direct{num_qubits};
-            sv_direct.initSV();
 
             sv_direct.applyRot({index}, adjoint, angles[index][0],
                                angles[index][1], angles[index][2]);
@@ -396,7 +390,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyRot", "[LightningGPU_Param]", float,
         }
         for (std::size_t index = 0; index < num_qubits; index++) {
             StateVectorCudaManaged<TestType> sv_direct{num_qubits};
-            sv_direct.initSV();
 
             sv_direct.applyRot({index}, adjoint, angles[index]);
             CHECK(sv_direct.getDataVector() ==
@@ -406,7 +399,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyRot", "[LightningGPU_Param]", float,
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < num_qubits; index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
 
             sv_dispatch.applyOperation("Rot", {index}, adjoint, angles[index]);
             CHECK(sv_dispatch.getDataVector() ==
@@ -422,7 +414,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyCRot", "[LightningGPU_Param]", float,
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8, 2.4};
 
@@ -441,7 +432,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyCRot", "[LightningGPU_Param]", float,
         SECTION("CRot0,1 |000> -> |000>") {
             {
                 StateVectorCudaManaged<TestType> sv_direct{num_qubits};
-                sv_direct.initSV();
 
                 sv_direct.applyCRot({0, 1}, adjoint, angles[0], angles[1],
                                     angles[2]);
@@ -451,7 +441,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyCRot", "[LightningGPU_Param]", float,
             }
             {
                 StateVectorCudaManaged<TestType> sv_direct{num_qubits};
-                sv_direct.initSV();
 
                 sv_direct.applyCRot({0, 1}, adjoint, angles);
 
@@ -461,7 +450,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyCRot", "[LightningGPU_Param]", float,
         }
         SECTION("CRot0,1 |100> -> |1>(a|0>+b|1>)|0>") {
             StateVectorCudaManaged<TestType> sv_direct{num_qubits};
-            sv_direct.initSV();
 
             sv_direct.applyOperation("PauliX", {0});
             sv_direct.applyCRot({0, 1}, adjoint, angles[0], angles[1],
@@ -473,7 +461,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyCRot", "[LightningGPU_Param]", float,
     SECTION("Apply using dispatcher") {
         SECTION("CRot0,1 |100> -> |1>(a|0>+b|1>)|0>") {
             StateVectorCudaManaged<TestType> sv_direct{num_qubits};
-            sv_direct.initSV();
 
             sv_direct.applyOperation("PauliX", {0});
             sv_direct.applyOperation("CRot", {0, 1}, adjoint, angles);
@@ -489,7 +476,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyIsingXX", "[LightningGPU_Param]", float,
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -642,7 +628,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyIsingYY", "[LightningGPU_Param]", float,
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -722,8 +707,7 @@ TEMPLATE_TEST_CASE("LightningGPU::applyIsingYY", "[LightningGPU_Param]", float,
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
-
+            sv_dispatch.resetStateVector();
             sv_dispatch.applyOperation("IsingYY", {0, 1}, true,
                                        {angles[index]});
             CHECK(sv_dispatch.getDataVector() ==
@@ -737,7 +721,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyIsingZZ", "[LightningGPU_Param]", float,
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -796,7 +779,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyIsingZZ", "[LightningGPU_Param]", float,
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
 
             sv_dispatch.applyOperation("IsingZZ", {0, 1}, true,
                                        {angles[index]});
@@ -988,7 +970,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applySingleExcitation",
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -1021,7 +1002,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applySingleExcitation",
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
             sv_dispatch.applyOperation("SingleExcitation", {0, 1}, false,
                                        {angles[index]});
             CHECK(sv_dispatch.getDataVector() ==
@@ -1035,7 +1015,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applySingleExcitationMinus",
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -1101,7 +1080,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applySingleExcitationMinus",
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
 
             sv_dispatch.applyOperation("SingleExcitationMinus", {0, 1}, true,
                                        {angles[index]});
@@ -1116,7 +1094,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applySingleExcitationPlus",
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -1182,7 +1159,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applySingleExcitationPlus",
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
 
             sv_dispatch.applyOperation("SingleExcitationPlus", {0, 1}, true,
                                        {angles[index]});
@@ -1197,7 +1173,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyDoubleExcitation",
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 4;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8, 2.4};
 
@@ -1221,7 +1196,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyDoubleExcitation",
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
 
             sv_dispatch.applyOperation("DoubleExcitation", {0, 1, 2, 3}, false,
                                        {angles[index]});
@@ -1236,7 +1210,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyDoubleExcitationMinus",
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 4;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -1278,7 +1251,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyDoubleExcitationMinus",
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
 
             sv_dispatch.applyOperation("DoubleExcitationMinus", {0, 1, 2, 3},
                                        true, {angles[index]});
@@ -1293,7 +1265,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyDoubleExcitationPlus",
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 4;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -1336,7 +1307,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyDoubleExcitationPlus",
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
             sv_dispatch.applyOperation("DoubleExcitationPlus", {0, 1, 2, 3},
                                        true, {angles[index]});
             CHECK(sv_dispatch.getDataVector() ==
@@ -1350,7 +1320,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyMultiRZ", "[LightningGPU_Param]", float,
     using cp_t = std::complex<TestType>;
     const std::size_t num_qubits = 3;
     StateVectorCudaManaged<TestType> sv{num_qubits};
-    sv.initSV();
 
     const std::vector<TestType> angles{0.3, 0.8};
 
@@ -1412,7 +1381,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyMultiRZ", "[LightningGPU_Param]", float,
     SECTION("Apply using dispatcher") {
         for (std::size_t index = 0; index < angles.size(); index++) {
             StateVectorCudaManaged<TestType> sv_dispatch{num_qubits};
-            sv_dispatch.initSV();
 
             sv_dispatch.applyOperation("MultiRZ", {0, 1}, true,
                                        {angles[index]});
@@ -1437,10 +1405,8 @@ TEMPLATE_TEST_CASE("LightningGPU::applyOperation 1 wire",
 
         SECTION("Apply using dispatcher") {
             StateVectorCudaManaged<TestType> sv{num_qubits};
-            sv.initSV();
 
             StateVectorCudaManaged<TestType> sv_expected{num_qubits};
-            sv_expected.initSV();
 
             for (std::size_t index = 0; index < num_qubits; index++) {
                 sv_expected.applyOperations({{"PauliX"}, {"PauliZ"}},
@@ -1459,9 +1425,7 @@ TEMPLATE_TEST_CASE("LightningGPU::applyOperation 1 wire",
 
         SECTION("Apply using dispatcher") {
             StateVectorCudaManaged<TestType> sv{num_qubits};
-            sv.initSV();
             StateVectorCudaManaged<TestType> sv_expected{num_qubits};
-            sv_expected.initSV();
 
             for (std::size_t index = 0; index < num_qubits; index++) {
                 sv_expected.applyOperations({{"PauliZ"}, {"PauliX"}},
@@ -1478,9 +1442,7 @@ TEMPLATE_TEST_CASE("LightningGPU::applyOperation 1 wire",
 
         SECTION("Apply using dispatcher") {
             StateVectorCudaManaged<TestType> sv{num_qubits};
-            sv.initSV();
             StateVectorCudaManaged<TestType> sv_expected{num_qubits};
-            sv_expected.initSV();
 
             for (std::size_t index = 0; index < num_qubits; index++) {
                 sv_expected.applyOperations({{"PauliX"}, {"PauliY"}},
@@ -1497,9 +1459,7 @@ TEMPLATE_TEST_CASE("LightningGPU::applyOperation 1 wire",
 
         SECTION("Apply using dispatcher") {
             StateVectorCudaManaged<TestType> sv{num_qubits};
-            sv.initSV();
             StateVectorCudaManaged<TestType> sv_expected{num_qubits};
-            sv_expected.initSV();
 
             for (std::size_t index = 0; index < num_qubits; index++) {
                 sv_expected.applyOperations({{"PauliY"}, {"PauliX"}},
@@ -1517,9 +1477,7 @@ TEMPLATE_TEST_CASE("LightningGPU::applyOperation 1 wire",
 
         SECTION("Apply using dispatcher") {
             StateVectorCudaManaged<TestType> sv{num_qubits};
-            sv.initSV();
             StateVectorCudaManaged<TestType> sv_expected{num_qubits};
-            sv_expected.initSV();
 
             for (std::size_t index = 0; index < num_qubits; index++) {
                 sv_expected.applyOperations({{"PauliY"}, {"PauliZ"}},
@@ -1537,9 +1495,7 @@ TEMPLATE_TEST_CASE("LightningGPU::applyOperation 1 wire",
 
         SECTION("Apply using dispatcher") {
             StateVectorCudaManaged<TestType> sv{num_qubits};
-            sv.initSV();
             StateVectorCudaManaged<TestType> sv_expected{num_qubits};
-            sv_expected.initSV();
 
             for (std::size_t index = 0; index < num_qubits; index++) {
                 sv_expected.applyOperations({{"PauliZ"}, {"PauliY"}},
@@ -1557,7 +1513,6 @@ TEMPLATE_TEST_CASE("LightningGPU::applyOperation multiple wires",
     const std::size_t num_qubits = 3;
 
     StateVectorCudaManaged<TestType> sv_init{num_qubits};
-    sv_init.initSV();
 
     sv_init.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
                             {{0}, {1}, {2}}, {false, false, false});
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
index 9b88afa388..968badd4dc 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
@@ -15,6 +15,7 @@
 #include <complex>
 #include <iostream>
 #include <limits>
+#include <numeric>
 #include <type_traits>
 #include <utility>
 #include <vector>
@@ -158,21 +159,17 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::SetStateVector",
             "the host") {
         StateVectorCudaMPI<PrecisionT> sv(mpi_manager, dt_local, mpi_buffersize,
                                           nGlobalIndexBits, nLocalIndexBits);
-        // The setStates will shuffle the state vector values on the device with
-        // the following indices and values setting on host. For example, the
-        // values[i] is used to set the indices[i] th element of state vector on
-        // the device. For example, values[2] (init_state[5]) will be copied to
-        // indices[2]th or (4th) element of the state vector.
 
-        sv.template setStateVector<index_type>(
-            init_state.size(), init_state.data(), indices.data(), false);
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+        std::vector<std::size_t> wires(num_qubits);
+        std::iota(wires.begin(), wires.end(), 0);
+        sv.setStateVector(values.data(), values.size(), wires);
 
-        mpi_manager.Barrier();
-        sv.CopyGpuDataToHost(local_state.data(),
-                             static_cast<std::size_t>(subSvLength));
-        mpi_manager.Barrier();
+        auto expected_local_state_vector = mpi_manager.scatter<cp_t>(values, 0);
 
-        CHECK(expected_local_state == Pennylane::Util::approx(local_state));
+        CHECK(expected_local_state_vector ==
+              Pennylane::Util::approx(sv.getDataVector()));
     }
 }
 
@@ -189,20 +186,10 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::SetIthStates",
         std::bit_width(static_cast<std::size_t>(mpi_manager.getSize())) - 1;
     std::size_t nLocalIndexBits = num_qubits - nGlobalIndexBits;
     std::size_t subSvLength = 1 << nLocalIndexBits;
-    mpi_manager.Barrier();
-
-    int index;
-    if (mpi_manager.getRank() == 0) {
-        std::mt19937 re{1337};
-        std::uniform_int_distribution<> distr(
-            0, Pennylane::Util::exp2(num_qubits) - 1);
-        index = distr(re);
-    }
-    mpi_manager.Bcast(index, 0);
 
     std::vector<cp_t> expected_state(Pennylane::Util::exp2(num_qubits), {0, 0});
     if (mpi_manager.getRank() == 0) {
-        expected_state[index] = {1.0, 0};
+        expected_state[expected_state.size() - 1] = {1.0, 0};
     }
 
     auto expected_local_state = mpi_manager.scatter(expected_state, 0);
@@ -219,8 +206,10 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::SetIthStates",
         "Set Ith element of the state state on device with data on the host") {
         StateVectorCudaMPI<PrecisionT> sv(mpi_manager, dt_local, mpi_buffersize,
                                           nGlobalIndexBits, nLocalIndexBits);
-        std::complex<PrecisionT> values = {1.0, 0};
-        sv.setBasisState(values, index, false);
+        std::vector<std::size_t> state(num_qubits, 1);
+        std::vector<std::size_t> wires(num_qubits);
+        std::iota(wires.begin(), wires.end(), 0);
+        sv.setBasisState(state, wires, false);
 
         std::vector<cp_t> h_sv0(subSvLength, {0.0, 0.0});
         sv.CopyGpuDataToHost(h_sv0.data(),
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_Param.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_Param.cpp
index a9d5ec106d..17cf43e842 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_Param.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_Param.cpp
@@ -380,7 +380,6 @@ TEMPLATE_TEST_CASE("LightningGPUMPI:applyOperation", "[LightningGPUMPI_Param]",
         std::string obs = "paulix";
         StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
                         nLocalIndexBits);
-        sv.initSV();
         PL_CHECK_THROWS_MATCHES(sv.applyOperation(obs, {0}), LightningException,
                                 "Currently unsupported gate: paulix");
     }
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
index 4e3e93ea79..8a62e89e84 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
@@ -59,7 +59,7 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
                         cudaStream_t stream_id);
 
 /**
- * @brief The CUDA kernel that setS state vector data on GPU device from the
+ * @brief The CUDA kernel that sets state vector data on GPU device from the
  * input values (on device) and their corresponding indices (on device)
  * information.
  *
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
index 460a4fa8cb..bcfdd3944c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
@@ -25,6 +25,7 @@
 #include <cuda.h>
 #include <cusparse.h>
 #include <custatevec.h> // custatevecApplyMatrix
+#include <optional>
 #include <random>
 #include <type_traits>
 #include <unordered_map>
@@ -93,16 +94,10 @@ class Measurements final
      */
     auto probs(const std::vector<std::size_t> &wires)
         -> std::vector<PrecisionT> {
-        PL_ABORT_IF_NOT(std::is_sorted(wires.cbegin(), wires.cend()) ||
-                            std::is_sorted(wires.rbegin(), wires.rend()),
-                        "LightningGPU does not currently support out-of-order "
-                        "wire indices with probability calculations");
-
         // Data return type fixed as double in custatevec function call
         std::vector<double> probabilities(Pennylane::Util::exp2(wires.size()));
         // this should be built upon by the wires not participating
-        int maskLen =
-            0; // static_cast<int>(BaseType::getNumQubits() - wires.size());
+        int maskLen = 0;
         int *maskBitString = nullptr; //
         int *maskOrdering = nullptr;
 
@@ -124,6 +119,8 @@ class Measurements final
                                this->_statevector.getNumQubits() - 1 - x);
                        });
 
+        std::reverse(wires_int.begin(), wires_int.end());
+
         PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumArray(
             /* custatevecHandle_t */ this->_statevector.getCusvHandle(),
             /* const void* */ this->_statevector.getData(),
@@ -218,7 +215,9 @@ class Measurements final
      * be accessed using the stride sample_id*num_qubits, where sample_id is a
      * number between 0 and num_samples-1.
      */
-    auto generate_samples(std::size_t num_samples) -> std::vector<std::size_t> {
+    auto generate_samples(std::size_t num_samples,
+                          const std::optional<std::size_t> &seed = std::nullopt)
+        -> std::vector<std::size_t> {
         std::vector<double> rand_nums(num_samples);
         custatevecSamplerDescriptor_t sampler;
 
@@ -238,7 +237,11 @@ class Measurements final
             data_type = CUDA_C_32F;
         }
 
-        this->setRandomSeed();
+        if (seed.has_value()) {
+            this->setSeed(seed.value());
+        } else {
+            this->setRandomSeed();
+        }
         std::uniform_real_distribution<PrecisionT> dis(0.0, 1.0);
         for (std::size_t n = 0; n < num_samples; n++) {
             rand_nums[n] = dis(this->rng);
@@ -273,7 +276,7 @@ class Measurements final
         PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerSample(
             this->_statevector.getCusvHandle(), sampler, bitStrings.data(),
             bitOrdering.data(), bitStringLen, rand_nums.data(), num_samples,
-            CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER));
+            CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
         PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
             this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
index 126ce2e686..6fee1711d2 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
@@ -130,6 +130,8 @@ class MeasurementsMPI final
             }
         }
 
+        std::reverse(wires_local.begin(), wires_local.end());
+
         std::vector<double> local_probabilities(
             Pennylane::Util::exp2(wires_local.size()));
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Expval.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Expval.cpp
index 3de2f6aab6..28a04d6d72 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Expval.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Expval.cpp
@@ -50,7 +50,6 @@ TEMPLATE_TEST_CASE("[Identity]", "[StateVectorCudaManaged_Expval]", float,
     const std::size_t num_qubits = 3;
     auto ONE = TestType(1);
     StateVectorT sv{num_qubits};
-    sv.initSV();
     auto m = Measurements(sv);
 
     SECTION("Using expval") {
@@ -73,7 +72,6 @@ TEMPLATE_TEST_CASE("[PauliX]", "[StateVectorCudaManaged_Expval]", float,
 
         SECTION("Using expval") {
             StateVectorT sv{num_qubits};
-            sv.initSV();
             auto m = Measurements(sv);
             sv.applyOperations({{"Hadamard"}, {"CNOT"}, {"CNOT"}},
                                {{0}, {0, 1}, {1, 2}},
@@ -85,7 +83,6 @@ TEMPLATE_TEST_CASE("[PauliX]", "[StateVectorCudaManaged_Expval]", float,
 
         SECTION("Using expval: Plus states") {
             StateVectorT sv{num_qubits};
-            sv.initSV();
             auto m = Measurements(sv);
             sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
                                {{0}, {1}, {2}}, {{false}, {false}, {false}});
@@ -96,7 +93,6 @@ TEMPLATE_TEST_CASE("[PauliX]", "[StateVectorCudaManaged_Expval]", float,
 
         SECTION("Using expval: Minus states") {
             StateVectorT sv{num_qubits};
-            sv.initSV();
             auto m = Measurements(sv);
             sv.applyOperations(
                 {{"PauliX"},
@@ -126,7 +122,6 @@ TEMPLATE_TEST_CASE("[PauliY]", "[StateVectorCudaManaged_Expval]", float,
 
         SECTION("Using expval") {
             StateVectorT sv{num_qubits};
-            sv.initSV();
             auto m = Measurements(sv);
             sv.applyOperations({{"Hadamard"}, {"CNOT"}, {"CNOT"}},
                                {{0}, {0, 1}, {1, 2}},
@@ -138,7 +133,6 @@ TEMPLATE_TEST_CASE("[PauliY]", "[StateVectorCudaManaged_Expval]", float,
 
         SECTION("Using expval: Plus i states") {
             StateVectorT sv{num_qubits};
-            sv.initSV();
             auto m = Measurements(sv);
             sv.applyOperations({{"RX"}, {"RX"}, {"RX"}}, {{0}, {1}, {2}},
                                {{false}, {false}, {false}},
@@ -150,7 +144,6 @@ TEMPLATE_TEST_CASE("[PauliY]", "[StateVectorCudaManaged_Expval]", float,
 
         SECTION("Using expval: Minus i states") {
             StateVectorT sv{num_qubits};
-            sv.initSV();
             auto m = Measurements(sv);
             sv.applyOperations({{"RX"}, {"RX"}, {"RX"}}, {{0}, {1}, {2}},
                                {{false}, {false}, {false}},
@@ -191,7 +184,6 @@ TEMPLATE_TEST_CASE("[Hadamard]", "[StateVectorCudaManaged_Expval]", float,
 
         SECTION("Using expval") {
             StateVectorT sv{num_qubits};
-            sv.initSV();
             auto m = Measurements(sv);
             sv.applyOperation("PauliX", {0});
             auto ob = NamedObs<StateVectorT>("Hadamard", {0});
@@ -209,7 +201,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::Hamiltonian_expval",
 
     SECTION("GetExpectationIdentity") {
         StateVectorT sv{num_qubits};
-        sv.initSV();
         auto m = Measurements(sv);
         std::vector<std::size_t> wires{0, 1, 2};
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Measure.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Measure.cpp
index f23497f0c7..4f3efaade5 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Measure.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Measure.cpp
@@ -257,7 +257,7 @@ TEMPLATE_TEST_CASE("Probabilities", "[Measures]", float, double) {
     using StateVectorT = StateVectorCudaManaged<TestType>;
     // Probabilities calculated with Pennylane default.qubit:
     std::vector<std::pair<std::vector<std::size_t>, std::vector<TestType>>>
-        input = {{{2, 1, 0},
+        input = {{{0, 1, 2},
                   {0.67078706, 0.03062806, 0.0870997, 0.00397696, 0.17564072,
                    0.00801973, 0.02280642, 0.00104134}}};
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Var.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Var.cpp
index 3b40d093be..deccedee0c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Var.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Var.cpp
@@ -38,7 +38,6 @@ TEMPLATE_TEST_CASE("Test variance of NamedObs", "[StateVectorCudaManaged_Var]",
     const std::size_t num_qubits = 2;
     SECTION("var(PauliX[0])") {
         StateVectorT sv{num_qubits};
-        sv.initSV();
         auto m = Measurements<StateVectorT>(sv);
 
         sv.applyOperations(
@@ -53,7 +52,6 @@ TEMPLATE_TEST_CASE("Test variance of NamedObs", "[StateVectorCudaManaged_Var]",
 
     SECTION("var(PauliY[0])") {
         StateVectorT sv{num_qubits};
-        sv.initSV();
         auto m = Measurements<StateVectorT>(sv);
 
         sv.applyOperations(
@@ -68,7 +66,6 @@ TEMPLATE_TEST_CASE("Test variance of NamedObs", "[StateVectorCudaManaged_Var]",
 
     SECTION("var(PauliZ[1])") {
         StateVectorT sv{num_qubits};
-        sv.initSV();
         auto m = Measurements<StateVectorT>(sv);
 
         sv.applyOperations(
@@ -89,7 +86,6 @@ TEMPLATE_TEST_CASE("Test variance of HermitianObs",
     using ComplexT = typename StateVectorT::ComplexT;
     SECTION("Using var") {
         StateVectorT sv{num_qubits};
-        sv.initSV();
         auto m = Measurements<StateVectorT>(sv);
 
         sv.applyOperations(
@@ -122,7 +118,6 @@ TEMPLATE_TEST_CASE("Test variance of TensorProdObs",
     const std::size_t num_qubits = 3;
     SECTION("Using var") {
         StateVectorT sv{num_qubits};
-        sv.initSV();
         auto m = Measurements<StateVectorT>(sv);
 
         sv.applyOperations(
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Expval.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Expval.cpp
index d3c55ff7ae..bbc1dba860 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Expval.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Expval.cpp
@@ -71,7 +71,6 @@ TEMPLATE_TEST_CASE("[Identity]", "[StateVectorCudaMPI_Expval]", float, double) {
 
     StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
                     nLocalIndexBits);
-    sv.initSV();
 
     auto m = MeasurementsMPI(sv);
 
@@ -112,7 +111,6 @@ TEMPLATE_TEST_CASE("[PauliX]", "[StateVectorCudaMPI_Expval]", float, double) {
         SECTION("Using expval") {
             StateVectorT sv(mpi_manager, dt_local, mpi_buffersize,
                             nGlobalIndexBits, nLocalIndexBits);
-            sv.initSV();
 
             auto m = MeasurementsMPI(sv);
             sv.applyOperations({{"Hadamard"}, {"CNOT"}, {"CNOT"}},
@@ -126,7 +124,6 @@ TEMPLATE_TEST_CASE("[PauliX]", "[StateVectorCudaMPI_Expval]", float, double) {
         SECTION("Using expval: Plus states") {
             StateVectorT sv(mpi_manager, dt_local, mpi_buffersize,
                             nGlobalIndexBits, nLocalIndexBits);
-            sv.initSV();
             auto m = MeasurementsMPI(sv);
             sv.applyOperations({{"Hadamard"}, {"Hadamard"}, {"Hadamard"}},
                                {{0}, {1}, {2}}, {{false}, {false}, {false}});
@@ -138,7 +135,6 @@ TEMPLATE_TEST_CASE("[PauliX]", "[StateVectorCudaMPI_Expval]", float, double) {
         SECTION("Using expval: Minus states") {
             StateVectorT sv(mpi_manager, dt_local, mpi_buffersize,
                             nGlobalIndexBits, nLocalIndexBits);
-            sv.initSV();
             auto m = MeasurementsMPI(sv);
             sv.applyOperations(
                 {{"PauliX"},
@@ -185,7 +181,6 @@ TEMPLATE_TEST_CASE("[PauliY]", "[StateVectorCudaMPI_Expval]", float, double) {
         SECTION("Using expval") {
             StateVectorT sv(mpi_manager, dt_local, mpi_buffersize,
                             nGlobalIndexBits, nLocalIndexBits);
-            sv.initSV();
             auto m = MeasurementsMPI(sv);
             sv.applyOperations({{"Hadamard"}, {"CNOT"}, {"CNOT"}},
                                {{0}, {0, 1}, {1, 2}},
@@ -198,7 +193,6 @@ TEMPLATE_TEST_CASE("[PauliY]", "[StateVectorCudaMPI_Expval]", float, double) {
         SECTION("Using expval: Plus i states") {
             StateVectorT sv(mpi_manager, dt_local, mpi_buffersize,
                             nGlobalIndexBits, nLocalIndexBits);
-            sv.initSV();
             auto m = MeasurementsMPI(sv);
             sv.applyOperations({{"RX"}, {"RX"}, {"RX"}}, {{0}, {1}, {2}},
                                {{false}, {false}, {false}},
@@ -211,7 +205,6 @@ TEMPLATE_TEST_CASE("[PauliY]", "[StateVectorCudaMPI_Expval]", float, double) {
         SECTION("Using expval: Minus i states") {
             StateVectorT sv(mpi_manager, dt_local, mpi_buffersize,
                             nGlobalIndexBits, nLocalIndexBits);
-            sv.initSV();
             auto m = MeasurementsMPI(sv);
             sv.applyOperations({{"RX"}, {"RX"}, {"RX"}}, {{0}, {1}, {2}},
                                {{false}, {false}, {false}},
@@ -293,7 +286,6 @@ TEMPLATE_TEST_CASE("[Hadamard]", "[StateVectorCudaMPI_Expval]", float, double) {
         SECTION("Using expval") {
             StateVectorT sv(mpi_manager, dt_local, mpi_buffersize,
                             nGlobalIndexBits, nLocalIndexBits);
-            sv.initSV();
             auto m = MeasurementsMPI(sv);
             sv.applyOperation("PauliX", {0});
             auto ob = NamedObsMPI<StateVectorT>("Hadamard", {0});
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Measure.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Measure.cpp
index c77f4e2215..7bdc578f77 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Measure.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Measure.cpp
@@ -411,7 +411,7 @@ TEMPLATE_TEST_CASE("Probabilities", "[MeasuresMPI]", double) {
     using StateVectorT = StateVectorCudaMPI<TestType>;
     // Probabilities calculated with Pennylane default.qubit:
     std::vector<std::pair<std::vector<std::size_t>, std::vector<TestType>>>
-        input = {{{2, 1, 0},
+        input = {{{0, 1, 2},
                   {0.67078706, 0.03062806, 0.0870997, 0.00397696, 0.17564072,
                    0.00801973, 0.02280642, 0.00104134}}};
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Var.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Var.cpp
index 0a9ed9c33b..cfe9675d0d 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Var.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Var.cpp
@@ -60,7 +60,6 @@ TEMPLATE_TEST_CASE("Test variance of NamedObs", "[StateVectorCudaMPI_Var]",
     SECTION("var(PauliX[0])") {
         StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
                         nLocalIndexBits);
-        sv.initSV();
 
         auto m = MeasurementsMPI(sv);
 
@@ -77,7 +76,6 @@ TEMPLATE_TEST_CASE("Test variance of NamedObs", "[StateVectorCudaMPI_Var]",
     SECTION("var(PauliY[0])") {
         StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
                         nLocalIndexBits);
-        sv.initSV();
 
         auto m = MeasurementsMPI(sv);
 
@@ -94,7 +92,6 @@ TEMPLATE_TEST_CASE("Test variance of NamedObs", "[StateVectorCudaMPI_Var]",
     SECTION("var(PauliZ[1])") {
         StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
                         nLocalIndexBits);
-        sv.initSV();
 
         auto m = MeasurementsMPI(sv);
 
@@ -135,7 +132,6 @@ TEMPLATE_TEST_CASE("Test variance of HermitianObs", "[StateVectorCudaMPI_Var]",
     SECTION("Using var") {
         StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
                         nLocalIndexBits);
-        sv.initSV();
 
         auto m = MeasurementsMPI(sv);
 
@@ -188,7 +184,6 @@ TEMPLATE_TEST_CASE("Test variance of TensorProdObs", "[StateVectorCudaMPI_Var]",
     SECTION("Using var") {
         StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
                         nLocalIndexBits);
-        sv.initSV();
 
         auto m = MeasurementsMPI(sv);
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
index 4003395b53..841074474b 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
@@ -266,3 +266,45 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::StateVectorCudaManaged",
         REQUIRE(std::is_constructible_v<StateVectorT, const StateVectorT &>);
     }
 }
+
+TEMPLATE_TEST_CASE("StateVectorCudaManaged::collapse",
+                   "[StateVectorCudaManaged]", float, double) {
+    using PrecisionT = TestType;
+    using ComplexT = typename StateVectorCudaManaged<PrecisionT>::ComplexT;
+    using CFP_t = typename StateVectorCudaManaged<PrecisionT>::CFP_t;
+    using TestVectorT = TestVector<ComplexT>;
+    // Checks collapse() against hand-written states for each wire and branch.
+    std::size_t wire = GENERATE(0, 1, 2);
+    std::size_t branch = GENERATE(0, 1);
+    constexpr std::size_t num_qubits = 3;
+
+    // TODO @tomlqc use same template for testing all Lightning flavours?
+
+    SECTION("Collapse the state vector after having measured one of the "
+            "qubits.") {
+        TestVectorT init_state = createPlusState_<ComplexT>(num_qubits);
+        // After the collapse the surviving amplitudes are 0.5 (4 * 0.5^2 = 1).
+        const ComplexT coef{0.5, PrecisionT{0.0}};
+        const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}};
+        // Indexed as expected_state[branch][wire]; wire 0 is the leftmost bit.
+        std::vector<std::vector<std::vector<ComplexT>>> expected_state = {
+            {{coef, coef, coef, coef, zero, zero, zero, zero},
+             {coef, coef, zero, zero, coef, coef, zero, zero},
+             {coef, zero, coef, zero, coef, zero, coef, zero}},
+            {{zero, zero, zero, zero, coef, coef, coef, coef},
+             {zero, zero, coef, coef, zero, zero, coef, coef},
+             {zero, coef, zero, coef, zero, coef, zero, coef}},
+        };
+
+        StateVectorCudaManaged<PrecisionT> sv(
+            reinterpret_cast<CFP_t *>(init_state.data()), init_state.size());
+
+        sv.collapse(wire, branch);
+        // Tolerance: 100 machine epsilons of the tested precision.
+        PrecisionT eps = std::numeric_limits<PrecisionT>::epsilon() * 1e2;
+        REQUIRE(isApproxEqual(sv.getDataVector().data(),
+                              sv.getDataVector().size(),
+                              expected_state[branch][wire].data(),
+                              expected_state[branch][wire].size(), eps));
+    }
+}
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
index 6dd5a01590..0c119409fd 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
@@ -36,6 +36,7 @@
 namespace {
 using namespace Pennylane::LightningGPU;
 using namespace Pennylane::LightningGPU::MPI;
+using namespace Pennylane::LightningGPU::Util;
 using namespace Pennylane::Util;
 
 using Pennylane::Util::isApproxEqual;
@@ -52,6 +53,23 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Constructibility",
     }
 }
 
+TEMPLATE_TEST_CASE("cuStateVec_helper::compute_local_index",
+                   "[Default Constructibility]", StateVectorCudaMPI<>) {
+    const std::size_t local_num_qubits = 4;
+    // 4 local qubits: bits 0-3 of the index are local, bit 4 is not.
+    SECTION("compute_local_index, index inside the current qubits set") {
+        const std::size_t index = 2; // 0b00010
+        std::size_t local_index = compute_local_index(index, local_num_qubits);
+        REQUIRE(local_index == index);
+    }
+
+    SECTION("compute_local_index, index outside the current qubits set") {
+        const std::size_t index = 16; // 0b10000
+        std::size_t local_index = compute_local_index(index, local_num_qubits);
+        REQUIRE(local_index == 0);
+    }
+}
+
 TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaMPI::Constructibility",
                            "[General Constructibility]", (StateVectorCudaMPI),
                            (float, double)) {
@@ -299,4 +317,4 @@ TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaMPI::applyOperations",
                                          {false, false}, {{0.0}}),
             LightningException, "must all be equal"); // invalid parameters
     }
-}
\ No newline at end of file
+}
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
index 8bd27c2dc8..ffdefe3e25 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
@@ -101,4 +101,22 @@ inline SharedCusvHandle make_shared_cusv_handle() {
     PL_CUSTATEVEC_IS_SUCCESS(custatevecCreate(&h));
     return {h, handleDeleter()};
 }
+
+/**
+ * @brief Compute the local index from a given index in multi-gpu workflow.
+ * Keeps the low `num_qubits` bits of `index` and drops all higher bits.
+ *
+ * @param index Global index of the target element.
+ * @param num_qubits Number of wires within the local devices.
+ * @return local_index Local index of the target element.
+ */
+inline std::size_t compute_local_index(const std::size_t index,
+                                       const std::size_t num_qubits) {
+    // A shift by >= the bit width is undefined; no bit can be dropped then.
+    constexpr std::size_t bit_width = sizeof(std::size_t) * 8U; // 8-bit bytes
+    constexpr std::size_t one{1U};
+    return (num_qubits >= bit_width) ? index
+                                     : index & ((one << num_qubits) - one);
+}
+
 } // namespace Pennylane::LightningGPU::Util
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/LightningKokkosSimulator.cpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/LightningKokkosSimulator.cpp
index d01c0340d2..04fab62ac5 100644
--- a/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/LightningKokkosSimulator.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/LightningKokkosSimulator.cpp
@@ -338,13 +338,22 @@ void LightningKokkosSimulator::PartialProbs(
     std::move(dv_probs.begin(), dv_probs.end(), probs.begin());
 }
 
-void LightningKokkosSimulator::Sample(DataView<double, 2> &samples,
-                                      std::size_t shots) {
+std::vector<size_t> LightningKokkosSimulator::GenerateSamples(size_t shots) {
+    // generate_samples is a member function of the Measures class.
     Pennylane::LightningKokkos::Measures::Measurements<StateVectorT> m{
         *(this->device_sv)};
+
     // PL-Lightning-Kokkos generates samples using the alias method.
     // Reference: https://en.wikipedia.org/wiki/Inverse_transform_sampling
-    auto li_samples = m.generate_samples(shots);
+    if (this->gen) {
+        return m.generate_samples(shots, (*(this->gen))());
+    }
+    return m.generate_samples(shots);
+}
+
+void LightningKokkosSimulator::Sample(DataView<double, 2> &samples,
+                                      std::size_t shots) {
+    auto li_samples = this->GenerateSamples(shots);
 
     RT_FAIL_IF(samples.size() != li_samples.size(),
                "Invalid size for the pre-allocated samples");
@@ -377,13 +386,7 @@ void LightningKokkosSimulator::PartialSample(
     // get device wires
     auto &&dev_wires = getDeviceWires(wires);
 
-    // generate_samples is a member function of the MeasuresKokkos class.
-    Pennylane::LightningKokkos::Measures::Measurements<StateVectorT> m{
-        *(this->device_sv)};
-
-    // PL-Lightning-Kokkos generates samples using the alias method.
-    // Reference: https://en.wikipedia.org/wiki/Inverse_transform_sampling
-    auto li_samples = m.generate_samples(shots);
+    auto li_samples = this->GenerateSamples(shots);
 
     // The lightning samples are layed out as a single vector of size
     // shots*qubits, where each element represents a single bit. The
@@ -407,13 +410,7 @@ void LightningKokkosSimulator::Counts(DataView<double, 1> &eigvals,
     RT_FAIL_IF(eigvals.size() != numElements || counts.size() != numElements,
                "Invalid size for the pre-allocated counts");
 
-    // generate_samples is a member function of the MeasuresKokkos class.
-    Pennylane::LightningKokkos::Measures::Measurements<StateVectorT> m{
-        *(this->device_sv)};
-
-    // PL-Lightning-Kokkos generates samples using the alias method.
-    // Reference: https://en.wikipedia.org/wiki/Inverse_transform_sampling
-    auto li_samples = m.generate_samples(shots);
+    auto li_samples = this->GenerateSamples(shots);
 
     // Fill the eigenvalues with the integer representation of the corresponding
     // computational basis bitstring. In the future, eigenvalues can also be
@@ -451,13 +448,7 @@ void LightningKokkosSimulator::PartialCounts(
     // get device wires
     auto &&dev_wires = getDeviceWires(wires);
 
-    // generate_samples is a member function of the MeasuresKokkos class.
-    Pennylane::LightningKokkos::Measures::Measurements<StateVectorT> m{
-        *(this->device_sv)};
-
-    // PL-Lightning-Kokkos generates samples using the alias method.
-    // Reference: https://en.wikipedia.org/wiki/Inverse_transform_sampling
-    auto li_samples = m.generate_samples(shots);
+    auto li_samples = this->GenerateSamples(shots);
 
     // Fill the eigenvalues with the integer representation of the corresponding
     // computational basis bitstring. In the future, eigenvalues can also be
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/LightningKokkosSimulator.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/LightningKokkosSimulator.hpp
index 890c3a267f..d28959f7c3 100644
--- a/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/LightningKokkosSimulator.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/LightningKokkosSimulator.hpp
@@ -96,6 +96,8 @@ class LightningKokkosSimulator final : public Catalyst::Runtime::QuantumDevice {
         return res;
     }
 
+    auto GenerateSamples(size_t shots) -> std::vector<size_t>;
+
   public:
     explicit LightningKokkosSimulator(const std::string &kwargs = "{}") {
         auto &&args = Catalyst::Runtime::parse_kwargs(kwargs);
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/tests/Test_LightningKokkosMeasures.cpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/tests/Test_LightningKokkosMeasures.cpp
index 7208732a3b..d32e6100ef 100644
--- a/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/tests/Test_LightningKokkosMeasures.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/catalyst/tests/Test_LightningKokkosMeasures.cpp
@@ -1754,26 +1754,71 @@ TEST_CASE("Counts and PartialCounts tests with numWires=0-4 shots=100",
 }
 
 TEST_CASE("Measurement with a seeded device", "[Measures]") {
-    for (std::size_t _ = 0; _ < 5; _++) {
-        std::unique_ptr<LKSimulator> sim = std::make_unique<LKSimulator>();
-        std::unique_ptr<LKSimulator> sim1 = std::make_unique<LKSimulator>();
+    std::array<std::unique_ptr<LKSimulator>, 2> sims;
+    std::vector<std::mt19937> gens{std::mt19937{37}, std::mt19937{37}};
 
-        std::mt19937 gen(37);
-        sim->SetDevicePRNG(&gen);
+    auto circuit = [](LKSimulator &sim, std::mt19937 &gen) {
+        sim.SetDevicePRNG(&gen);
         std::vector<intptr_t> Qs;
         Qs.reserve(1);
-        Qs.push_back(sim->AllocateQubit());
-        sim->NamedOperation("Hadamard", {}, {Qs[0]}, false);
-        auto m = sim->Measure(Qs[0]);
-
-        std::mt19937 gen1(37);
-        sim1->SetDevicePRNG(&gen1);
-        std::vector<intptr_t> Qs1;
-        Qs1.reserve(1);
-        Qs1.push_back(sim1->AllocateQubit());
-        sim1->NamedOperation("Hadamard", {}, {Qs1[0]}, false);
-        auto m1 = sim1->Measure(Qs1[0]);
-
-        CHECK(*m == *m1);
+        Qs.push_back(sim.AllocateQubit());
+        sim.NamedOperation("Hadamard", {}, {Qs[0]}, false);
+        auto m = sim.Measure(Qs[0]);
+        return m;
+    };
+
+    for (std::size_t trial = 0; trial < 5; trial++) {
+        sims[0] = std::make_unique<LKSimulator>();
+        sims[1] = std::make_unique<LKSimulator>();
+
+        auto m0 = circuit(*(sims[0]), gens[0]);
+        auto m1 = circuit(*(sims[1]), gens[1]);
+
+        CHECK(*m0 == *m1);
+    }
+}
+
+TEST_CASE("Sample with a seeded device", "[Measures]") {
+    std::size_t shots = 100;
+    std::array<std::unique_ptr<LKSimulator>, 2> sims;
+    std::vector<std::vector<double>> sample_vec(2,
+                                                std::vector<double>(shots * 4));
+    // NOTE(review): buffers hold shots * 4 doubles, the views are shots x 1.
+    std::vector<MemRefT<double, 2>> buffers{
+        MemRefT<double, 2>{
+            sample_vec[0].data(), sample_vec[0].data(), 0, {shots, 1}, {1, 1}},
+        MemRefT<double, 2>{
+            sample_vec[1].data(), sample_vec[1].data(), 0, {shots, 1}, {1, 1}},
+    };
+    std::vector<DataView<double, 2>> views{
+        DataView<double, 2>(buffers[0].data_aligned, buffers[0].offset,
+                            buffers[0].sizes, buffers[0].strides),
+        DataView<double, 2>(buffers[1].data_aligned, buffers[1].offset,
+                            buffers[1].sizes, buffers[1].strides)};
+    // Same seed for both generators; they are not re-seeded between trials.
+    std::vector<std::mt19937> gens{std::mt19937{37}, std::mt19937{37}};
+    // Seeded circuit on one qubit: Hadamard, RX(0.5), then `shots` samples.
+    auto circuit = [shots](LKSimulator &sim, DataView<double, 2> &view,
+                           std::mt19937 &gen) {
+        sim.SetDevicePRNG(&gen);
+        std::vector<intptr_t> Qs;
+        Qs.reserve(1);
+        Qs.push_back(sim.AllocateQubit());
+        sim.NamedOperation("Hadamard", {}, {Qs[0]}, false);
+        sim.NamedOperation("RX", {0.5}, {Qs[0]}, false);
+        sim.Sample(view, shots);
+    };
+
+    for (std::size_t trial = 0; trial < 5; trial++) {
+        sims[0] = std::make_unique<LKSimulator>();
+        sims[1] = std::make_unique<LKSimulator>();
+
+        for (std::size_t sim_idx = 0; sim_idx < sims.size(); sim_idx++) {
+            circuit(*(sims[sim_idx]), views[sim_idx], gens[sim_idx]);
+        }
+        // Both sample buffers are compared over their full length.
+        for (std::size_t i = 0; i < sample_vec[0].size(); i++) {
+            CHECK((sample_vec[0][i] == sample_vec[1][i]));
+        }
     }
 }
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp
index 28449e5015..ee8684e814 100644
--- a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp
@@ -14,6 +14,7 @@
 #pragma once
 #include <chrono>
 #include <cstdint>
+#include <optional>
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
@@ -649,13 +650,16 @@ class Measurements final
      * Reference https://en.wikipedia.org/wiki/Inverse_transform_sampling
      *
      * @param num_samples Number of Samples
+     * @param seed Seed to generate the samples from
      *
      * @return std::vector<std::size_t> to the samples.
      * Each sample has a length equal to the number of qubits. Each sample can
      * be accessed using the stride sample_id*num_qubits, where sample_id is a
      * number between 0 and num_samples-1.
      */
-    auto generate_samples(std::size_t num_samples) -> std::vector<std::size_t> {
+    auto generate_samples(std::size_t num_samples,
+                          const std::optional<std::size_t> &seed = std::nullopt)
+        -> std::vector<std::size_t> {
         const std::size_t num_qubits = this->_statevector.getNumQubits();
         const std::size_t N = this->_statevector.getLength();
         Kokkos::View<std::size_t *> samples("num_samples",
@@ -674,10 +678,12 @@ class Measurements final
             });
 
         // Sampling using Random_XorShift64_Pool
-        Kokkos::Random_XorShift64_Pool<> rand_pool(
-            std::chrono::high_resolution_clock::now()
-                .time_since_epoch()
-                .count());
+        auto rand_pool = seed.has_value()
+                             ? Kokkos::Random_XorShift64_Pool<>(seed.value())
+                             : Kokkos::Random_XorShift64_Pool<>(
+                                   std::chrono::high_resolution_clock::now()
+                                       .time_since_epoch()
+                                       .count());
 
         Kokkos::parallel_for(
             Kokkos::RangePolicy<KokkosExecSpace>(0, num_samples),
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index 4bf72e332b..d57bd70631 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -23,6 +23,7 @@
 #include <algorithm>
 #include <complex>
 #include <cstdio>
+#include <optional>
 #include <random>
 #include <type_traits>
 #include <unordered_map>
@@ -573,14 +574,17 @@ class Measurements final
      * Reference: https://en.wikipedia.org/wiki/Alias_method
      *
      * @param num_samples The number of samples to generate.
+     * @param seed Seed to generate the samples from
      * @return 1-D vector of samples in binary, each sample is
      * separated by a stride equal to the number of qubits.
      */
-    std::vector<std::size_t> generate_samples(const std::size_t num_samples) {
+    std::vector<std::size_t>
+    generate_samples(const std::size_t num_samples,
+                     const std::optional<std::size_t> &seed = std::nullopt) {
         const std::size_t num_qubits = this->_statevector.getNumQubits();
         std::vector<std::size_t> wires(num_qubits);
         std::iota(wires.begin(), wires.end(), 0);
-        return generate_samples(wires, num_samples);
+        return generate_samples(wires, num_samples, seed);
     }
 
     /**
@@ -588,15 +592,21 @@ class Measurements final
      *
      * @param wires Sample are generated for the specified wires.
      * @param num_samples The number of samples to generate.
+     * @param seed Seed to generate the samples from
      * @return 1-D vector of samples in binary, each sample is
      * separated by a stride equal to the number of qubits.
      */
     std::vector<std::size_t>
     generate_samples(const std::vector<std::size_t> &wires,
-                     const std::size_t num_samples) {
+                     const std::size_t num_samples,
+                     const std::optional<std::size_t> &seed = std::nullopt) {
         const std::size_t n_wires = wires.size();
         std::vector<std::size_t> samples(num_samples * n_wires);
-        this->setRandomSeed();
+        if (seed.has_value()) {
+            this->setSeed(seed.value());
+        } else {
+            this->setRandomSeed();
+        }
         DiscreteRandomVariable<PrecisionT> drv{this->rng, probs(wires)};
         // The Python layer expects a 2D array with dimensions (n_samples x
         // n_wires) and hence the linear index is `s * n_wires + (n_wires - 1 -
diff --git a/pennylane_lightning/core/src/utils/Util.hpp b/pennylane_lightning/core/src/utils/Util.hpp
index e0d3a1170e..5478cdbdcb 100644
--- a/pennylane_lightning/core/src/utils/Util.hpp
+++ b/pennylane_lightning/core/src/utils/Util.hpp
@@ -21,6 +21,7 @@
 #include <algorithm>
 #include <cmath>
 #include <complex>
+#include <concepts> // integral, floating_point
 #include <numbers>
 #include <numeric> // transform_reduce
 #include <set>
@@ -41,6 +42,7 @@ namespace Pennylane::Util {
  * @return constexpr std::complex<T>
  */
 template <class T, class U = T>
+    requires std::integral<U> || std::floating_point<U>
 inline static constexpr auto ConstMult(U a, std::complex<T> b)
     -> std::complex<T> {
     return {a * b.real(), a * b.imag()};
diff --git a/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
new file mode 100644
index 0000000000..50f9acef38
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
@@ -0,0 +1,248 @@
+# Copyright 2018-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""
+Internal methods for adjoint Jacobian differentiation method.
+"""
+
+from __future__ import annotations
+
+from warnings import warn
+
+try:
+    from pennylane_lightning.lightning_gpu_ops import DevPool
+    from pennylane_lightning.lightning_gpu_ops.algorithms import (
+        AdjointJacobianC64,
+        AdjointJacobianC128,
+        create_ops_listC64,
+        create_ops_listC128,
+    )
+
+    try:
+        from pennylane_lightning.lightning_gpu_ops.algorithmsMPI import (
+            AdjointJacobianMPIC64,
+            AdjointJacobianMPIC128,
+            create_ops_listMPIC64,
+            create_ops_listMPIC128,
+        )
+
+        mpi_error = None
+        MPI_SUPPORT = True
+    except ImportError as ex_mpi:
+        mpi_error = ex_mpi
+        MPI_SUPPORT = False
+
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+
+import numpy as np
+from pennylane import BasisState, StatePrep
+from pennylane.operation import Operation
+from pennylane.tape import QuantumTape
+from scipy.sparse import csr_matrix
+
+# pylint: disable=ungrouped-imports
+from pennylane_lightning.core._adjoint_jacobian_base import LightningBaseAdjointJacobian
+from pennylane_lightning.core._serialize import QuantumScriptSerializer
+
+
+class LightningGPUAdjointJacobian(LightningBaseAdjointJacobian):
+    """Check and execute the adjoint Jacobian differentiation method.
+
+    Args:
+        qubit_state(LightningGPUStateVector): State Vector to calculate the adjoint Jacobian with.
+        batch_obs(bool): If serialized tape is to be batched or not.
+            For Lightning GPU, distribute the observations across GPUs in the same node. Defaults to False.
+            For Lightning GPU-MPI, if `batch_obs=False` the computation requires more memory and is faster,
+            while `batch_obs=True` allows a larger number of qubits simulation
+            at the expense of high computational cost. Defaults to False.
+    """
+
+    # pylint: disable=too-few-public-methods
+
+    def __init__(
+        self,
+        qubit_state: LightningGPUStateVector,  # pylint: disable=undefined-variable
+        batch_obs: bool = False,
+    ) -> None:
+        # The base class receives the state vector and the batching flag.
+        super().__init__(qubit_state, batch_obs)
+
+        self._dp = DevPool()
+
+        # MPI usage is dictated by the state vector's MPI handler.
+        self._use_mpi = qubit_state._mpi_handler.use_mpi
+        if self._use_mpi:
+            self._mpi_handler = qubit_state._mpi_handler
+
+        # Initialize the C++ binds
+        self._jacobian_lightning, self._create_ops_list_lightning = self._adjoint_jacobian_dtype()
+
+        # Warning about performance with MPI and batch observation
+        if self._use_mpi and not self._batch_obs:
+            warn(
+                "Using LightningGPU with `batch_obs=False` and `use_mpi=True` has the limitation of requiring more memory. If you want to allocate larger number of qubits use the option `batch_obs=True`. "
+                "For more information Check out the section `Parallel adjoint differentiation support` in our website https://docs.pennylane.ai/projects/lightning/en/stable/lightning_gpu/device.html for more details.",
+                RuntimeWarning,
+            )
+
+    def _adjoint_jacobian_dtype(self):
+        """Select the adjoint Jacobian C++ bindings matching ``self.dtype`` and the MPI mode.
+
+        Returns: tuple of (AdjointJacobian instance, ``create_ops_list`` function).
+        Raises: ImportError if MPI is requested but the MPI bindings failed to import."""
+        if self._use_mpi:
+            if not MPI_SUPPORT:
+                # The MPI names used below are undefined when their import failed.
+                raise ImportError(str(mpi_error)) from mpi_error
+            jacobian_lightning = (
+                AdjointJacobianMPIC64() if self.dtype == np.complex64 else AdjointJacobianMPIC128()
+            )
+            create_ops_list_lightning = (
+                create_ops_listMPIC64 if self.dtype == np.complex64 else create_ops_listMPIC128
+            )
+            return jacobian_lightning, create_ops_list_lightning
+
+        # without MPI
+        jacobian_lightning = (
+            AdjointJacobianC64() if self.dtype == np.complex64 else AdjointJacobianC128()
+        )
+        create_ops_list_lightning = (
+            create_ops_listC64 if self.dtype == np.complex64 else create_ops_listC128
+        )
+        return jacobian_lightning, create_ops_list_lightning
+
+    def _process_jacobian_tape(
+        self, tape: QuantumTape, split_obs: bool = False, use_mpi: bool = False
+    ):
+        """Serialize a tape into the inputs of the C++ adjoint Jacobian calculation.
+
+        Args:
+            tape (QuantumTape): Operations and measurements that represent instructions for execution on Lightning.
+            split_obs (bool, optional): If splitting the observables in a list. Defaults to False.
+            use_mpi (bool, optional): If distributing computation with MPI. Defaults to False.
+
+        Returns:
+            dictionary: serialized data for the Jacobian calculation, or ``None`` when the
+            tape has no trainable parameters.
+        """
+        use_csingle = self._qubit_state.dtype == np.complex64
+        serializer_args = (self._qubit_state.device_name, use_csingle, use_mpi, split_obs)
+
+        # A fresh serializer is built for the observables and for the operations.
+        observables = QuantumScriptSerializer(*serializer_args).serialize_observables(tape)
+        obs_serialized, obs_indices = observables
+
+        operations = QuantumScriptSerializer(*serializer_args).serialize_ops(tape)
+        ops_serialized, use_sp = operations
+        ops_serialized = self._create_ops_list_lightning(*ops_serialized)
+
+        # We need to filter out indices in trainable_params which do not
+        # correspond to operators.
+        trainable_params = sorted(tape.trainable_params)
+        if not trainable_params:
+            return None
+
+        tp_shift = []
+        record_tp_rows = []
+
+        for row, trainable_param in enumerate(trainable_params):
+            # get row-th operator among differentiable operators
+            operation, _, _ = tape.get_operation(row)
+            if not isinstance(operation, Operation):
+                continue
+            if isinstance(operation, (BasisState, StatePrep)):
+                # state preparations are skipped, like non-operations
+                continue
+            tp_shift.append(trainable_param)
+            record_tp_rows.append(row)
+
+        if use_sp:
+            # The tape starts with a state preparation, so every retained index moves
+            # down by one (assumes a single state preparation — TODO confirm).
+            tp_shift = [param - 1 for param in tp_shift]
+
+        # every trainable parameter is counted, retained or not
+        all_params = len(trainable_params)
+        return {
+            "state_vector": self.state,
+            "obs_serialized": obs_serialized,
+            "ops_serialized": ops_serialized,
+            "tp_shift": tp_shift,
+            "record_tp_rows": record_tp_rows,
+            "all_params": all_params,
+            "obs_indices": obs_indices,
+        }
+
+    def calculate_jacobian(self, tape: QuantumTape):
+        """Computes the Jacobian with the adjoint method.
+
+        .. code-block:: python
+
+            statevector = LightningGPUStateVector(num_wires=num_wires)
+            statevector = statevector.get_final_state(tape)
+            jacobian = LightningGPUAdjointJacobian(statevector).calculate_jacobian(tape)
+
+        Args:
+            tape (QuantumTape): Operations and measurements that represent instructions for execution on Lightning.
+
+        Returns:
+            The Jacobian of a tape.
+        """
+
+        empty_array = self._handle_raises(tape, is_jacobian=True)
+
+        if empty_array:
+            return np.array([], dtype=self.dtype)
+
+        if self._use_mpi:
+            split_obs = False  # with MPI batched means compute Jacobian one observables at a time, no point splitting linear combinations
+        else:
+            split_obs = self._dp.getTotalDevices() if self._batch_obs else False
+
+        processed_data = self._process_jacobian_tape(tape, split_obs, self._use_mpi)
+
+        if not processed_data:  # training_params is empty
+            return np.array([], dtype=self.dtype)
+
+        trainable_params = processed_data["tp_shift"]
+
+        if self._batch_obs:  # Batching of Measurements
+            jac = self._jacobian_lightning.batched(
+                processed_data["state_vector"],
+                processed_data["obs_serialized"],
+                processed_data["ops_serialized"],
+                trainable_params,
+            )
+        else:
+            jac = self._jacobian_lightning(
+                processed_data["state_vector"],
+                processed_data["obs_serialized"],
+                processed_data["ops_serialized"],
+                trainable_params,
+            )
+
+        jac = np.array(jac)
+        has_shape0 = bool(len(jac))
+
+        num_obs = len(np.unique(processed_data["obs_indices"]))
+        rows = processed_data["obs_indices"]
+        cols = np.arange(len(rows), dtype=int)
+        data = np.ones(len(rows))
+        red_mat = csr_matrix((data, (rows, cols)), shape=(num_obs, len(rows)))
+        jac = red_mat @ jac.reshape((len(rows), -1))
+        jac = jac.reshape(-1, len(trainable_params)) if has_shape0 else jac
+        jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))
+        jac_r[:, processed_data["record_tp_rows"]] = jac
+        return self._adjoint_jacobian_processing(jac_r)
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
new file mode 100644
index 0000000000..4b95762ccc
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -0,0 +1,202 @@
+# Copyright 2018-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Class implementation for state vector measurements.
+"""
+
+from __future__ import annotations
+
+from warnings import warn
+
+try:
+    from pennylane_lightning.lightning_gpu_ops import MeasurementsC64, MeasurementsC128
+
+    try:
+        from pennylane_lightning.lightning_gpu_ops import MeasurementsMPIC64, MeasurementsMPIC128
+
+        mpi_error = None
+        MPI_SUPPORT = True
+    except ImportError as ex_mpi:
+        mpi_error = ex_mpi
+        MPI_SUPPORT = False
+
+except ImportError as error_import:
+    warn(str(error_import), UserWarning)
+
+from typing import List
+
+import numpy as np
+import pennylane as qml
+from pennylane.measurements import CountsMP, MeasurementProcess, SampleMeasurement, Shots
+from pennylane.typing import TensorLike
+
+# pylint: disable=ungrouped-imports
+from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
+from pennylane_lightning.core._serialize import QuantumScriptSerializer
+
+
+class LightningGPUMeasurements(LightningBaseMeasurements):  # pylint: disable=too-few-public-methods
+    """Lightning GPU Measurements class
+
+    Measures the state provided by the LightningGPUStateVector class.
+
+    Args:
+        qubit_state(LightningGPUStateVector): Lightning state-vector class containing the state vector to be measured.
+    """
+
+    def __init__(
+        self,
+        qubit_state: LightningGPUStateVector,  # pylint: disable=undefined-variable
+    ) -> None:
+        """Bind the C++ measurements object to the state vector held by ``qubit_state``."""
+        super().__init__(qubit_state)
+
+        self._use_mpi = qubit_state._mpi_handler.use_mpi
+
+        if self._use_mpi:  # the two attributes below are only set when MPI is in use
+            self._mpi_handler = qubit_state._mpi_handler
+            self._num_local_wires = qubit_state._mpi_handler.num_local_wires
+
+        self._measurement_lightning = self._measurement_dtype()(qubit_state.state_vector)
+
+    def _measurement_dtype(self):
+        """Binding to Lightning GPU Measurements C++ class.
+
+        Returns: the Measurements class
+        """
+        if self._use_mpi:
+            if not MPI_SUPPORT:
+                warn(str(mpi_error), UserWarning)
+
+            return MeasurementsMPIC128 if self.dtype == np.complex128 else MeasurementsMPIC64
+
+        # without MPI
+        return MeasurementsC128 if self.dtype == np.complex128 else MeasurementsC64
+
+    def _measure_with_samples_diagonalizing_gates(
+        self,
+        mps: List[SampleMeasurement],
+        shots: Shots,
+    ) -> TensorLike:
+        """
+        Returns the samples of the measurement process performed on the given state,
+        by rotating the state into the measurement basis using the diagonalizing gates
+        given by the measurement process.
+
+        Args:
+            mps (~.measurements.SampleMeasurement): The sample measurements to perform
+            shots (~.measurements.Shots): The number of samples to take
+
+        Returns:
+            TensorLike[Any]: Sample measurement results
+        """
+        # apply diagonalizing gates
+        self._apply_diagonalizing_gates(mps)
+
+        # Specific for LGPU:
+        total_indices = self._qubit_state.num_wires
+        wires = qml.wires.Wires(range(total_indices))
+
+        def _process_single_shot(samples):
+            processed = []
+            for mp in mps:
+                res = mp.process_samples(samples, wires)
+                if not isinstance(mp, CountsMP):
+                    res = qml.math.squeeze(res)
+
+                processed.append(res)
+
+            return tuple(processed)
+
+        try:
+            samples = self._measurement_lightning.generate_samples(
+                len(wires), shots.total_shots
+            ).astype(int, copy=False)
+
+        except ValueError as ex:
+            if str(ex) != "probabilities contain NaN":
+                raise ex
+            samples = qml.math.full((shots.total_shots, len(wires)), 0)
+
+        self._apply_diagonalizing_gates(mps, adjoint=True)
+
+        # if there is a shot vector, use the shots.bins generator to
+        # split samples w.r.t. the shots
+        processed_samples = []
+        for lower, upper in shots.bins():
+            result = _process_single_shot(samples[..., lower:upper, :])
+            processed_samples.append(result)
+
+        return (
+            tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
+        )
+
+    def expval(self, measurementprocess: MeasurementProcess):
+        """Expectation value of the supplied observable contained in the MeasurementProcess.
+
+        Args:
+            measurementprocess (StateMeasurement): measurement to apply to the state
+
+        Returns:
+            Expectation value of the observable
+        """
+
+        if isinstance(measurementprocess.obs, qml.SparseHamiltonian):
+            # ensuring CSR sparse representation.
+
+            if self._use_mpi:
+                # Identity for CSR_SparseHamiltonian to pass to processes with rank != 0 to reduce
+                # host(cpu) memory requirements
+                obs = qml.Identity(0)
+                Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix()
+                H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1))
+                CSR_SparseHamiltonian = H_sparse.sparse_matrix().tocsr()
+                # CSR_SparseHamiltonian for rank == 0
+                if self._mpi_handler.mpi_manager.getRank() == 0:
+                    CSR_SparseHamiltonian = measurementprocess.obs.sparse_matrix().tocsr()
+            else:
+                CSR_SparseHamiltonian = measurementprocess.obs.sparse_matrix(
+                    wire_order=list(range(self._qubit_state.num_wires))
+                ).tocsr(copy=False)
+
+            return self._measurement_lightning.expval(
+                CSR_SparseHamiltonian.indptr,
+                CSR_SparseHamiltonian.indices,
+                CSR_SparseHamiltonian.data,
+            )
+
+        # use specialized functors to compute expval(Hermitian)
+        if isinstance(measurementprocess.obs, qml.Hermitian):
+            observable_wires = measurementprocess.obs.wires
+            if self._use_mpi and len(observable_wires) > self._num_local_wires:
+                raise RuntimeError(
+                    "MPI backend does not support Hermitian with number of target wires larger than local wire number."
+                )
+            matrix = measurementprocess.obs.matrix()
+            return self._measurement_lightning.expval(matrix, observable_wires)
+
+        if (
+            isinstance(measurementprocess.obs, qml.ops.Hamiltonian)
+            or (measurementprocess.obs.arithmetic_depth > 0)
+            or isinstance(measurementprocess.obs.name, List)
+        ):
+            # pylint: disable=protected-access
+            ob_serialized = QuantumScriptSerializer(
+                self._qubit_state.device_name, self.dtype == np.complex64, self._use_mpi
+            )._ob(measurementprocess.obs)
+            return self._measurement_lightning.expval(ob_serialized)
+
+        return self._measurement_lightning.expval(
+            measurementprocess.obs.name, measurementprocess.obs.wires
+        )
diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
new file mode 100644
index 0000000000..0d569ebeb1
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -0,0 +1,126 @@
+# Copyright 2022-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module contains the :class:`~.MPIHandler` class, an MPI handler for using the LightningGPU device with multiple GPUs on multi-node systems.
+"""
+
+try:
+    # pylint: disable=no-name-in-module
+    from pennylane_lightning.lightning_gpu_ops import DevPool, DevTag, MPIManager
+
+    MPI_SUPPORT = True
+except ImportError:
+    MPI_SUPPORT = False
+
+from typing import Union
+
+import numpy as np
+
+
+# MPI options
+class MPIHandler:  # pylint: disable=too-few-public-methods
+    """MPI handler for PennyLane Lightning GPU device.
+
+    MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
+
+    When MPI is used, this class initializes the variables and methods needed to handle the data across nodes and performs memory-allocation checks on each device.
+
+    Args:
+        mpi (bool): declare if the device will use the MPI support.
+        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
+        num_wires (int): the number of wires to initialize the device with.
+        c_dtype (np.complex64, np.complex128): Datatypes for statevector representation.
+    """
+
+    def __init__(
+        self,
+        mpi: bool,
+        mpi_buf_size: int,
+        num_wires: int,
+        c_dtype: Union[np.complex64, np.complex128],
+    ) -> None:
+        """Store the MPI options and, when ``mpi`` is set, validate them and set up MPI."""
+        self.use_mpi = mpi
+        self.mpi_buf_size = mpi_buf_size
+
+        self._dp = DevPool()
+
+        if self.use_mpi:
+            # MPI_SUPPORT is False when the MPI bindings could not be imported
+            if not MPI_SUPPORT:
+                raise ImportError(
+                    "Pre-compiled binaries for lightning.gpu with MPI support are not available. "
+                    "To manually compile from source, follow the instructions at "
+                    "https://docs.pennylane.ai/projects/lightning/en/stable/dev/installation.html."
+                )
+
+            if mpi_buf_size < 0:
+                raise ValueError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
+            # x & (x - 1) is non-zero unless x is a power of two; 0 is explicitly allowed
+            if mpi_buf_size > 0 and (mpi_buf_size & (mpi_buf_size - 1)):
+                raise ValueError(
+                    f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2."
+                )
+
+            # Only after the buffer-size checks pass: create the MPI manager and device tag
+            self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
+
+            # set the number of global and local wires
+            commSize = self.mpi_manager.getSize()
+            self.num_global_wires = commSize.bit_length() - 1  # floor(log2(commSize))
+            self.num_local_wires = num_wires - self.num_global_wires
+
+            self._check_memory_size(c_dtype, mpi_buf_size)
+
+        if not self.use_mpi:
+            self.num_local_wires = num_wires
+            self.num_global_wires = num_wires  # NOTE(review): num_wires, not 0, without MPI — confirm
+
+    def _mebibytesToBytes(self, mebibytes):
+        return mebibytes * 1024 * 1024
+
+    def _check_memory_size(self, c_dtype, mpi_buf_size):
+        # The MPI buffer (given in MiB) must fit inside the local state-vector allocation
+        local_sv_bytes = (1 << self.num_local_wires) * np.dtype(c_dtype).itemsize
+        if local_sv_bytes < self._mebibytesToBytes(mpi_buf_size):
+            raise RuntimeError("The MPI buffer size is larger than the local state vector size.")
+
+    def _mpi_init_helper(self, num_wires):
+        """Run the MPI sanity checks, select a GPU and return ``(mpi_manager, devtag)``."""
+
+        # initialize MPIManager and config check in the MPIManager ctor
+        mpi_manager = MPIManager()
+
+        # require at least as many GPUs on the node as there are processes on the node
+        numDevices = self._dp.getTotalDevices()
+        numProcsNode = mpi_manager.getSizeNode()
+
+        if numDevices < numProcsNode:
+            raise ValueError(
+                "Number of devices should be larger than or equal to the number of processes on each node."
+            )
+
+        # reject more processes than 2**(num_wires - 1), i.e. half the statevector elements
+        if mpi_manager.getSize() > (1 << (num_wires - 1)):
+            raise ValueError(
+                "Number of processes should be smaller than the number of statevector elements."
+            )
+
+        # set GPU device: the device id is the rank modulo the processes-per-node count
+        rank = mpi_manager.getRank()
+        deviceid = rank % numProcsNode
+        self._dp.setDeviceID(deviceid)
+        devtag = DevTag(deviceid)
+
+        return (mpi_manager, devtag)
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
new file mode 100644
index 0000000000..77e453778b
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -0,0 +1,351 @@
+# Copyright 2018-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Class implementation for lightning_gpu state-vector manipulation.
+"""
+from warnings import warn
+
+try:
+    from pennylane_lightning.lightning_gpu_ops import StateVectorC64, StateVectorC128
+
+    try:  # Try to import the MPI modules
+        from pennylane_lightning.lightning_gpu_ops import StateVectorMPIC64, StateVectorMPIC128
+
+        mpi_error = None
+        MPI_SUPPORT = True
+    except ImportError as ex_mpi:
+        mpi_error = ex_mpi
+        MPI_SUPPORT = False
+
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+from typing import Union
+
+import numpy as np
+import pennylane as qml
+from pennylane import DeviceError
+from pennylane.measurements import MidMeasureMP
+from pennylane.ops import Conditional
+from pennylane.ops.op_math import Adjoint
+from pennylane.tape import QuantumScript
+from pennylane.wires import Wires
+
+# pylint: disable=ungrouped-imports
+from pennylane_lightning.core._serialize import global_phase_diagonal
+from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
+
+from ._measurements import LightningGPUMeasurements
+from ._mpi_handler import MPIHandler
+
+gate_cache_needs_hash = (
+    qml.BlockEncode,
+    qml.ControlledQubitUnitary,
+    qml.DiagonalQubitUnitary,
+    qml.MultiControlledX,
+    qml.OrbitalRotation,
+    qml.PSWAP,
+    qml.QubitUnitary,
+)
+
+
+class LightningGPUStateVector(LightningBaseStateVector):
+    """Lightning GPU state-vector class.
+
+    Interfaces with C++ python binding methods for state-vector manipulation.
+
+    Args:
+        num_wires(int): the number of wires to initialize the device with
+        dtype: Datatypes for state-vector representation. Must be one of
+            ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
+        device_name(string): state vector device name. Options: ["lightning.gpu"]
+        mpi_handler(MPIHandler): MPI handler for PennyLane Lightning GPU device.
+            Provides functionality to distribute the state-vector to multiple devices.
+        use_async (bool): is host-device data copy asynchronized or not.
+    """
+
+    def __init__(
+        self,
+        num_wires: int,
+        dtype: Union[np.complex128, np.complex64] = np.complex128,
+        mpi_handler: MPIHandler = None,
+        use_async: bool = False,
+    ):
+
+        super().__init__(num_wires, dtype)
+
+        self._device_name = "lightning.gpu"
+
+        # Initialize GPU and MPI variables
+        if mpi_handler is None:
+            mpi_handler = MPIHandler(False, 0, num_wires, dtype)
+
+        self._num_global_wires = mpi_handler.num_global_wires
+        self._num_local_wires = mpi_handler.num_local_wires
+
+        self._mpi_handler = mpi_handler
+        self._use_async = use_async
+
+        # Initialize the state vector
+        if self._mpi_handler.use_mpi:  # using MPI
+            self._qubit_state = self._state_dtype()(
+                self._mpi_handler.mpi_manager,
+                self._mpi_handler.devtag,
+                self._mpi_handler.mpi_buf_size,
+                self._mpi_handler.num_global_wires,
+                self._mpi_handler.num_local_wires,
+            )
+        else:  # without MPI
+            self._qubit_state = self._state_dtype()(self.num_wires)
+
+    def _state_dtype(self):
+        """Binding to Lightning Managed state vector C++ class.
+
+        Returns: the state vector class
+        """
+        if self._mpi_handler.use_mpi:
+            if not MPI_SUPPORT:
+                warn(str(mpi_error), UserWarning)
+
+            return StateVectorMPIC128 if self.dtype == np.complex128 else StateVectorMPIC64
+
+        # without MPI
+        return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64
+
+    def syncD2H(self, state_vector, use_async: bool = False):
+        """Copy the state vector data on device to a state vector on the host provided by the user.
+        Args:
+            state_vector(array[complex]): the state vector array on host; flattened in C order before the copy.
+            use_async(bool): indicates whether to use asynchronous memory copy from device to host or not.
+            Note: This function only supports synchronized memory copy.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.c_type)
+        >>> dev.syncD2H(state_vector)
+        >>> print(state_vector)
+        [0.+0.j 1.+0.j]
+        """
+        # NOTE(review): ravel() copies non-contiguous input, so the caller's array may not be filled
+        self._qubit_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
+
+    @property
+    def state(self):
+        """Copy the state vector data from the device to the host.
+
+        A state vector Numpy array is explicitly allocated on the host to store and return the data.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> print(dev.state)
+        [0.+0.j 1.+0.j]
+        """
+        state = np.zeros(2**self._num_local_wires, dtype=self.dtype)
+        self.syncD2H(state)
+        return state
+
+    def syncH2D(self, state_vector, use_async: bool = False):
+        """Copy the state vector data on host provided by the user to the state vector on the device
+        Args:
+            state_vector(array[complex]): the state vector array on host.
+            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+            Note: This function only supports synchronized memory copy.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=3)
+        >>> obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2)
+        >>> obs1 = qml.Identity(1)
+        >>> H = qml.Hamiltonian([1.0, 1.0], [obs1, obs])
+        >>> state_vector = np.array([0.0 + 0.0j, 0.0 + 0.1j, 0.1 + 0.1j, 0.1 + 0.2j,
+            0.2 + 0.2j, 0.3 + 0.3j, 0.3 + 0.4j, 0.4 + 0.5j,], dtype=np.complex64,)
+        >>> dev.syncH2D(state_vector)
+        >>> res = dev.expval(H)
+        >>> print(res)
+        1.0
+        """
+        self._qubit_state.HostToDevice(state_vector.ravel(order="C"), use_async)
+
+    @staticmethod
+    def _asarray(arr, dtype=None):
+        """Return ``arr`` as a NumPy array, cast to ``dtype`` when it is floating or complex."""
+        arr = np.asarray(arr)  # arr is not copied
+
+        # Non-inexact input (e.g. integer data) is passed through untouched.
+        if arr.dtype.kind not in ["f", "c"]:
+            return arr
+
+        # ``dtype`` used to be ignored here; copy only when a cast is actually required.
+        return arr if dtype is None or arr.dtype == dtype else arr.astype(dtype)
+
+    def _apply_state_vector(self, state, device_wires, use_async: bool = False):
+        """Initialize the state vector on GPU with a specified state on host.
+        Note that any use of this method will introduce host-overheads.
+        Args:
+        state (array[complex]): normalized input state (on host) of length ``2**len(wires)``
+                or broadcasted state of shape ``(batch_size, 2**len(wires))``
+        device_wires (Wires): wires that get initialized in the state
+        use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+        Note: This function only supports synchronized memory copy from host to device.
+        """
+
+        if isinstance(state, self._qubit_state.__class__):
+            raise DeviceError("LightningGPU does not support allocate external state_vector.")
+
+            # TODO
+            # Create an implementation in the C++ backend and binding to be able
+            # to allocate memory for a new statevector and copy the data
+            # from an external state vector.
+            # state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
+            # state.getState(state_data)
+            # state = state_data
+
+        state = self._asarray(state, dtype=self.dtype)  # this operation on host
+        output_shape = [2] * self._num_local_wires
+
+        if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
+            # Initialize the entire device state with the input state
+            if self.num_wires == self._num_local_wires:
+                self.syncH2D(np.reshape(state, output_shape))
+                return
+            local_state = np.zeros(2**self._num_local_wires, dtype=self._dtype)
+            self._mpi_handler.mpi_manager.Scatter(state, local_state, 0)
+            self.syncH2D(np.reshape(local_state, output_shape))
+            return
+
+        # set the state vector on GPU with provided state and their corresponding wires
+        self._qubit_state.setStateVector(state, list(device_wires), use_async)
+
+    def _apply_lightning_controlled(self, operation):
+        """Apply a controlled ``GlobalPhase`` operation to the state vector.
+
+        Args:
+            operation (~pennylane.operation.Operation): controlled GlobalPhase operation to apply
+
+        Returns:
+            None
+        """
+        state = self.state_vector
+
+        control_wires = list(operation.control_wires)
+        control_values = operation.control_values
+        name = operation.name
+        # Apply GlobalPhase as an explicit diagonal built over all device wires
+        inv = False
+        param = operation.parameters[0]
+        wires = self.wires.indices(operation.wires)
+        matrix = global_phase_diagonal(param, self.wires, control_wires, control_values)
+        state.apply(name, wires, inv, [[param]], matrix)
+
+    def _apply_lightning_midmeasure(
+        self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str
+    ):
+        """Execute a MidMeasureMP operation and return the sample in mid_measurements.
+
+        Args:
+            operation (~pennylane.operation.Operation): mid-circuit measurement
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots.
+
+        Returns:
+            None
+        """
+        wires = self.wires.indices(operation.wires)
+        wire = list(wires)[0]
+        if postselect_mode == "fill-shots" and operation.postselect is not None:
+            sample = operation.postselect
+        else:
+            circuit = QuantumScript([], [qml.sample(wires=operation.wires)], shots=1)
+            sample = LightningGPUMeasurements(self).measure_final_state(circuit)
+            sample = np.squeeze(sample)
+        mid_measurements[operation] = sample
+        getattr(self.state_vector, "collapse")(wire, bool(sample))
+        if operation.reset and bool(sample):
+            self.apply_operations([qml.PauliX(operation.wires)], mid_measurements=mid_measurements)
+
+    # pylint: disable=unused-argument
+    def _apply_lightning(
+        self, operations, mid_measurements: dict = None, postselect_mode: str = None
+    ):
+        """Apply a list of operations to the state vector.
+
+        Args:
+            operations (list[~pennylane.operation.Operation]): operations to apply
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots. Default is ``None``.
+
+        Returns:
+            None
+        """
+        state = self.state_vector
+
+        # Skip over identity operations instead of performing
+        # matrix multiplication with it.
+        for operation in operations:
+            if isinstance(operation, qml.Identity):
+                continue
+            if isinstance(operation, Adjoint):
+                name = operation.base.name
+                invert_param = True
+            else:
+                name = operation.name
+                invert_param = False
+            method = getattr(state, name, None)
+            wires = list(operation.wires)
+
+            if isinstance(operation, Conditional):
+                if operation.meas_val.concretize(mid_measurements):
+                    self._apply_lightning([operation.base])
+            elif isinstance(operation, MidMeasureMP):
+                self._apply_lightning_midmeasure(
+                    operation, mid_measurements, postselect_mode=postselect_mode
+                )
+            elif method is not None:  # apply specialized gate
+                param = operation.parameters
+                method(wires, invert_param, param)
+            elif isinstance(operation, qml.ops.Controlled) and isinstance(
+                operation.base, qml.GlobalPhase
+            ):  # apply n-controlled gate
+                # LGPU do not support the controlled gates except for GlobalPhase
+                self._apply_lightning_controlled(operation)
+            else:  # apply gate as a matrix
+                try:
+                    mat = qml.matrix(operation)
+                except AttributeError:  # pragma: no cover
+                    # To support older versions of PL
+                    mat = operation.matrix
+
+                r_dtype = np.float32 if self.dtype == np.complex64 else np.float64
+                param = (
+                    [[r_dtype(operation.hash)]]
+                    if isinstance(operation, gate_cache_needs_hash)
+                    else []
+                )
+                if len(mat) == 0:
+                    raise ValueError("Unsupported operation")
+
+                self._qubit_state.apply(
+                    name,
+                    wires,
+                    False,
+                    param,
+                    mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
+                )  # Parameters can be ignored for explicit matrices; F-order for cuQuantum
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 2894b999f3..56454613cc 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -16,921 +16,524 @@
 This module contains the :class:`~.LightningGPU` class, a PennyLane simulator device that
 interfaces with the NVIDIA cuQuantum cuStateVec simulator library for GPU-enabled calculations.
 """
+from __future__ import annotations
 
 from ctypes.util import find_library
+from dataclasses import replace
 from importlib import util as imp_util
-from itertools import product
 from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union
 from warnings import warn
 
 import numpy as np
 import pennylane as qml
-from pennylane import BasisState, DeviceError, QuantumFunctionError, Rot, StatePrep, math
-from pennylane.measurements import Expectation, State
-from pennylane.ops.op_math import Adjoint
-from pennylane.wires import Wires
-from scipy.sparse import csr_matrix
-
-from pennylane_lightning.core._serialize import QuantumScriptSerializer, global_phase_diagonal
-from pennylane_lightning.core._version import __version__
-
-# pylint: disable=import-error, no-name-in-module, ungrouped-imports
-from pennylane_lightning.core.lightning_base import LightningBase
+from pennylane.devices import DefaultExecutionConfig, ExecutionConfig
+from pennylane.devices.default_qubit import adjoint_ops
+from pennylane.devices.modifiers import simulator_tracking, single_tape_support
+from pennylane.devices.preprocess import (
+    decompose,
+    mid_circuit_measurements,
+    no_sampling,
+    validate_adjoint_trainable_params,
+    validate_device_wires,
+    validate_measurements,
+    validate_observables,
+)
+from pennylane.measurements import MidMeasureMP
+from pennylane.operation import DecompositionUndefinedError, Operator, Tensor
+from pennylane.ops import Prod, SProd, Sum
+from pennylane.tape import QuantumScript
+from pennylane.transforms.core import TransformProgram
+from pennylane.typing import Result
+
+from pennylane_lightning.core.lightning_newAPI_base import (
+    LightningBase,
+    QuantumTape_or_Batch,
+    Result_or_ResultBatch,
+)
 
 try:
     from pennylane_lightning.lightning_gpu_ops import (
         DevPool,
-        MeasurementsC64,
-        MeasurementsC128,
-        StateVectorC64,
-        StateVectorC128,
         backend_info,
         get_gpu_arch,
         is_gpu_supported,
     )
-    from pennylane_lightning.lightning_gpu_ops.algorithms import (
-        AdjointJacobianC64,
-        AdjointJacobianC128,
-        create_ops_listC64,
-        create_ops_listC128,
-    )
-
-    try:
-        # pylint: disable=no-name-in-module
-        from pennylane_lightning.lightning_gpu_ops import (
-            DevTag,
-            MeasurementsMPIC64,
-            MeasurementsMPIC128,
-            MPIManager,
-            StateVectorMPIC64,
-            StateVectorMPIC128,
-        )
-        from pennylane_lightning.lightning_gpu_ops.algorithmsMPI import (
-            AdjointJacobianMPIC64,
-            AdjointJacobianMPIC128,
-            create_ops_listMPIC64,
-            create_ops_listMPIC128,
-        )
-
-        MPI_SUPPORT = True
-    except ImportError as ex:
-        warn(str(ex), UserWarning)
-        MPI_SUPPORT = False
-
-    if find_library("custatevec") is None and not imp_util.find_spec(
-        "cuquantum"
-    ):  # pragma: no cover
-        raise ImportError(
-            "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
-        )
-    if not DevPool.getTotalDevices():  # pragma: no cover
-        raise ValueError("No supported CUDA-capable device found")
-
-    if not is_gpu_supported():  # pragma: no cover
-        raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
 
     LGPU_CPP_BINARY_AVAILABLE = True
+
 except (ImportError, ValueError) as ex:
     warn(str(ex), UserWarning)
-    backend_info = None
     LGPU_CPP_BINARY_AVAILABLE = False
+    backend_info = None
 
-
-def _gpu_dtype(dtype, mpi=False):
-    if dtype not in [np.complex128, np.complex64]:  # pragma: no cover
-        raise ValueError(f"Data type is not supported for state-vector computation: {dtype}")
-    if mpi:
-        return StateVectorMPIC128 if dtype == np.complex128 else StateVectorMPIC64
-    return StateVectorC128 if dtype == np.complex128 else StateVectorC64
-
-
-def _adj_dtype(use_csingle, mpi=False):
-    if mpi:
-        return AdjointJacobianMPIC64 if use_csingle else AdjointJacobianMPIC128
-    return AdjointJacobianC64 if use_csingle else AdjointJacobianC128
-
-
-def _mebibytesToBytes(mebibytes):
-    return mebibytes * 1024 * 1024
-
-
-allowed_operations = {
-    "Identity",
-    "BasisState",
-    "QubitStateVector",
-    "StatePrep",
-    "QubitUnitary",
-    "ControlledQubitUnitary",
-    "MultiControlledX",
-    "DiagonalQubitUnitary",
-    "PauliX",
-    "PauliY",
-    "PauliZ",
-    "MultiRZ",
-    "GlobalPhase",
-    "C(GlobalPhase)",
-    "Hadamard",
-    "S",
-    "Adjoint(S)",
-    "T",
-    "Adjoint(T)",
-    "SX",
-    "Adjoint(SX)",
-    "CNOT",
-    "SWAP",
-    "ISWAP",
-    "PSWAP",
-    "Adjoint(ISWAP)",
-    "SISWAP",
-    "Adjoint(SISWAP)",
-    "SQISW",
-    "CSWAP",
-    "Toffoli",
-    "CY",
-    "CZ",
-    "PhaseShift",
-    "ControlledPhaseShift",
-    "RX",
-    "RY",
-    "RZ",
-    "Rot",
-    "CRX",
-    "CRY",
-    "CRZ",
-    "CRot",
-    "IsingXX",
-    "IsingYY",
-    "IsingZZ",
-    "IsingXY",
-    "SingleExcitation",
-    "SingleExcitationPlus",
-    "SingleExcitationMinus",
-    "DoubleExcitation",
-    "DoubleExcitationPlus",
-    "DoubleExcitationMinus",
-    "QubitCarry",
-    "QubitSum",
-    "OrbitalRotation",
-    "ECR",
-    "BlockEncode",
-    "C(BlockEncode)",
-}
-
-allowed_observables = {
-    "PauliX",
-    "PauliY",
-    "PauliZ",
-    "Hadamard",
-    "SparseHamiltonian",
-    "Hamiltonian",
-    "LinearCombination",
-    "Hermitian",
-    "Identity",
-    "Projector",
-    "Sum",
-    "Prod",
-    "SProd",
-}
-
-gate_cache_needs_hash = (
-    qml.BlockEncode,
-    qml.ControlledQubitUnitary,
-    qml.DiagonalQubitUnitary,
-    qml.MultiControlledX,
-    qml.OrbitalRotation,
-    qml.PSWAP,
-    qml.QubitUnitary,
+from ._adjoint_jacobian import LightningGPUAdjointJacobian
+from ._measurements import LightningGPUMeasurements
+from ._mpi_handler import MPIHandler
+from ._state_vector import LightningGPUStateVector
+
+# The set of supported operations.
+_operations = frozenset(  # operator names; looked up via ``op.name`` in stopping_condition
+    {
+        "Identity",
+        "QubitStateVector",
+        "QubitUnitary",
+        "ControlledQubitUnitary",
+        "MultiControlledX",
+        "DiagonalQubitUnitary",
+        "PauliX",
+        "PauliY",
+        "PauliZ",
+        "MultiRZ",
+        "GlobalPhase",
+        "C(GlobalPhase)",
+        "Hadamard",
+        "S",
+        "Adjoint(S)",
+        "T",
+        "Adjoint(T)",
+        "SX",
+        "Adjoint(SX)",
+        "CNOT",
+        "SWAP",
+        "ISWAP",
+        "PSWAP",
+        "Adjoint(ISWAP)",
+        "SISWAP",
+        "Adjoint(SISWAP)",
+        "SQISW",
+        "CSWAP",
+        "Toffoli",
+        "CY",
+        "CZ",
+        "PhaseShift",
+        "ControlledPhaseShift",
+        "RX",
+        "RY",
+        "RZ",
+        "Rot",
+        "CRX",
+        "CRY",
+        "CRZ",
+        "CRot",
+        "IsingXX",
+        "IsingYY",
+        "IsingZZ",
+        "IsingXY",
+        "SingleExcitation",
+        "SingleExcitationPlus",
+        "SingleExcitationMinus",
+        "DoubleExcitation",
+        "DoubleExcitationPlus",
+        "DoubleExcitationMinus",
+        "QubitCarry",
+        "QubitSum",
+        "OrbitalRotation",
+        "ECR",
+        "BlockEncode",
+        "C(BlockEncode)",
+    }
+)
+# End the set of supported operations.
+
+# The set of supported observables.
+_observables = frozenset(  # observable names; looked up via ``obs.name`` by the checks below
+    {
+        "PauliX",
+        "PauliY",
+        "PauliZ",
+        "Hadamard",
+        "SparseHamiltonian",
+        "Hamiltonian",
+        "LinearCombination",
+        "Hermitian",
+        "Identity",
+        "Projector",
+        "Sum",
+        "Prod",
+        "SProd",
+    }
 )
 
 
-class LightningGPU(LightningBase):  # pylint: disable=too-many-instance-attributes
-    """PennyLane Lightning GPU device.
+def stopping_condition(op: Operator) -> bool:
+    """Return ``True`` if ``op`` is accepted as-is by ``lightning.gpu`` (see ``_operations``)."""
+    # QFT and GroverOperator are only accepted below a wire-count threshold, to avoid
+    # building large matrices; this should reduce runtime overheads for larger systems.
+    if isinstance(op, qml.QFT):
+        return len(op.wires) < 10  # accepted only on fewer than 10 wires
+    if isinstance(op, qml.GroverOperator):
+        return len(op.wires) < 13  # accepted only on fewer than 13 wires
+    if isinstance(op, qml.PauliRot):
+        return False  # PauliRot is never accepted directly
 
-    A GPU-backed Lightning device using NVIDIA cuQuantum SDK.
+    return op.name in _operations  # everything else: membership by operator name
 
-    Use of this device requires pre-built binaries or compilation from source. Check out the
-    :doc:`/lightning_gpu/installation` guide for more details.
 
-    Args:
-        wires (int): the number of wires to initialize the device with
-        mpi (bool): enable MPI support. MPI support will be enabled if ``mpi`` is set as``True``.
-        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
-        sync (bool): immediately sync with host-sv after applying operations
-        c_dtype: Datatypes for statevector representation. Must be one of ``np.complex64`` or ``np.complex128``.
-        shots (int): How many times the circuit should be evaluated (or sampled) to estimate
-            the expectation values. Defaults to ``None`` if not specified. Setting
-            to ``None`` results in computing statistics like expectation values and
-            variances analytically.
-        batch_obs (Union[bool, int]): determine whether to use multiple GPUs within the same node or not
-    """
-
-    name = "Lightning GPU PennyLane plugin"
-    short_name = "lightning.gpu"
+def stopping_condition_shots(op: Operator) -> bool:
+    """Finite-shots variant of :func:`stopping_condition`: additionally accepts
+    ``MidMeasureMP`` and ``qml.ops.op_math.Conditional`` operations."""
+    return stopping_condition(op) or isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional))
 
-    operations = allowed_operations
-    observables = allowed_observables
-    _backend_info = backend_info
-    config = Path(__file__).parent / "lightning_gpu.toml"
-    _CPP_BINARY_AVAILABLE = LGPU_CPP_BINARY_AVAILABLE
 
-    def __init__(
-        self,
-        wires,
-        *,
-        mpi: bool = False,
-        mpi_buf_size: int = 0,
-        sync=False,
-        c_dtype=np.complex128,
-        shots=None,
-        batch_obs: Union[bool, int] = False,
-    ):  # pylint: disable=too-many-arguments
-        if c_dtype is np.complex64:
-            self.use_csingle = True
-        elif c_dtype is np.complex128:
-            self.use_csingle = False
-        else:
-            raise TypeError(f"Unsupported complex type: {c_dtype}")
-
-        super().__init__(wires, shots=shots, c_dtype=c_dtype)
+def accepted_observables(obs: Operator) -> bool:
+    """Return ``True`` if ``obs.name`` is one of the observables supported by ``lightning.gpu``."""
+    return obs.name in _observables
 
-        self._dp = DevPool()
 
-        if not mpi:
-            self._mpi = False
-            self._num_local_wires = self.num_wires
-            self._gpu_state = _gpu_dtype(c_dtype)(self._num_local_wires)
-        else:
-            self._mpi = True
-            self._mpi_init_helper(self.num_wires)
-
-            if mpi_buf_size < 0:
-                raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}")
-
-            if mpi_buf_size:
-                if mpi_buf_size & (mpi_buf_size - 1):
-                    raise TypeError(
-                        f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2."
-                    )
-                # Memory size in bytes
-                sv_memsize = np.dtype(c_dtype).itemsize * (1 << self._num_local_wires)
-                if _mebibytesToBytes(mpi_buf_size) > sv_memsize:
-                    w_msg = "The MPI buffer size is larger than the local state vector size."
-                    warn(
-                        w_msg,
-                        RuntimeWarning,
-                    )
+def adjoint_observables(obs: Operator) -> bool:
+    """Return ``True`` if ``obs`` is usable with adjoint differentiation on ``lightning.gpu``.
+    ``Projector`` is rejected, including when nested in Tensor, SProd, Sum or Prod operands."""
+    if isinstance(obs, qml.Projector):
+        return False  # Projector is rejected even though its name is in ``_observables``
 
-            self._gpu_state = _gpu_dtype(c_dtype, mpi)(
-                self._mpi_manager,
-                self._devtag,
-                mpi_buf_size,
-                self._num_global_wires,
-                self._num_local_wires,
-            )
+    if isinstance(obs, Tensor):
+        if any(isinstance(o, qml.Projector) for o in obs.non_identity_obs):
+            return False
+        return True  # other Tensor factors are not checked by name here
 
-        self._sync = sync
-        self._batch_obs = batch_obs
-        self._create_basis_state(0)
-
-    def _mpi_init_helper(self, num_wires):
-        """Set up MPI checks."""
-        if not MPI_SUPPORT:
-            raise ImportError("MPI related APIs are not found.")
-        # initialize MPIManager and config check in the MPIManager ctor
-        self._mpi_manager = MPIManager()
-        # check if number of GPUs per node is larger than
-        # number of processes per node
-        numDevices = self._dp.getTotalDevices()
-        numProcsNode = self._mpi_manager.getSizeNode()
-        if numDevices < numProcsNode:
-            raise ValueError(
-                "Number of devices should be larger than or equal to the number of processes on each node."
-            )
-        # check if the process number is larger than number of statevector elements
-        if self._mpi_manager.getSize() > (1 << (num_wires - 1)):
-            raise ValueError(
-                "Number of processes should be smaller than the number of statevector elements."
-            )
-        # set the number of global and local wires
-        commSize = self._mpi_manager.getSize()
-        self._num_global_wires = commSize.bit_length() - 1
-        self._num_local_wires = num_wires - self._num_global_wires
-        # set GPU device
-        rank = self._mpi_manager.getRank()
-        deviceid = rank % numProcsNode
-        self._dp.setDeviceID(deviceid)
-        self._devtag = DevTag(deviceid)
-
-    @staticmethod
-    def _asarray(arr, dtype=None):
-        arr = np.asarray(arr)  # arr is not copied
-
-        if arr.dtype.kind not in ["f", "c"]:
-            return arr
-
-        if not dtype:
-            dtype = arr.dtype
-
-        return arr
-
-    # pylint disable=missing-function-docstring
-    def reset(self):
-        """Reset the device"""
-        super().reset()
-        # init the state vector to |00..0>
-        self._gpu_state.resetGPU(False)  # Sync reset
+    if isinstance(obs, SProd):
+        return adjoint_observables(obs.base)  # the decision is delegated to the scaled base
 
-    @property
-    def state(self):
-        # pylint disable=missing-function-docstring
-        """Copy the state vector data from the device to the host.
+    if isinstance(obs, (Sum, Prod)):
+        return all(adjoint_observables(o) for o in obs)
 
-        A state vector Numpy array is explicitly allocated on the host to store and return the data.
+    return obs.name in _observables
 
-        **Example**
 
-        >>> dev = qml.device('lightning.gpu', wires=1)
-        >>> dev.apply([qml.PauliX(wires=[0])])
-        >>> print(dev.state)
-        [0.+0.j 1.+0.j]
-        """
-        state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
-        state = self._asarray(state, dtype=self.C_DTYPE)
-        self.syncD2H(state)
-        return state
+def adjoint_measurements(mp: qml.measurements.MeasurementProcess) -> bool:
+    """Return ``True`` only if the measurement process ``mp`` is an ``ExpectationMP``."""
+    return isinstance(mp, qml.measurements.ExpectationMP)
 
-    @property
-    def create_ops_list(self):
-        """Returns create_ops_list function of the matching precision."""
-        if self._mpi:
-            return create_ops_listMPIC64 if self.use_csingle else create_ops_listMPIC128
-        return create_ops_listC64 if self.use_csingle else create_ops_listC128
 
-    @property
-    def measurements(self):
-        """Returns Measurements constructor of the matching precision."""
-        if self._mpi:
-            return (
-                MeasurementsMPIC64(self._gpu_state)
-                if self.use_csingle
-                else MeasurementsMPIC128(self._gpu_state)
-            )
-        return (
-            MeasurementsC64(self._gpu_state)
-            if self.use_csingle
-            else MeasurementsC128(self._gpu_state)
-        )
+def _supports_adjoint(circuit):  # True if the adjoint transform program accepts ``circuit``
+    if circuit is None:  # no circuit to validate
+        return True
 
-    def syncD2H(self, state_vector, use_async=False):
-        """Copy the state vector data on device to a state vector on the host provided by the user
-        Args:
-            state_vector(array[complex]): the state vector array on host
-            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-            Note: This function only supports synchronized memory copy.
-
-        **Example**
-        >>> dev = qml.device('lightning.gpu', wires=1)
-        >>> dev.apply([qml.PauliX(wires=[0])])
-        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.C_DTYPE)
-        >>> dev.syncD2H(state_vector)
-        >>> print(state_vector)
-        [0.+0.j 1.+0.j]
-        """
-        self._gpu_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
+    prog = TransformProgram()
+    _add_adjoint_transforms(prog)  # fills ``prog`` with the adjoint-specific transforms
 
-    def syncH2D(self, state_vector, use_async=False):
-        """Copy the state vector data on host provided by the user to the state vector on the device
-        Args:
-            state_vector(array[complex]): the state vector array on host.
-            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-            Note: This function only supports synchronized memory copy.
-
-        **Example**
-        >>> dev = qml.device('lightning.gpu', wires=3)
-        >>> obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2)
-        >>> obs1 = qml.Identity(1)
-        >>> H = qml.Hamiltonian([1.0, 1.0], [obs1, obs])
-        >>> state_vector = np.array([0.0 + 0.0j, 0.0 + 0.1j, 0.1 + 0.1j, 0.1 + 0.2j,
-            0.2 + 0.2j, 0.3 + 0.3j, 0.3 + 0.4j, 0.4 + 0.5j,], dtype=np.complex64,)
-        >>> dev.syncH2D(state_vector)
-        >>> res = dev.expval(H)
-        >>> print(res)
-        1.0
-        """
-        self._gpu_state.HostToDevice(state_vector.ravel(order="C"), use_async)
+    try:
+        prog((circuit,))  # run the program on a one-circuit batch
+    except (DecompositionUndefinedError, qml.DeviceError, AttributeError):
+        return False  # one of the caught errors was raised while running the program
+    return True
 
-    def _create_basis_state(self, index, use_async=False):
-        """Return a computational basis state over all wires.
-        Args:
-            index (int): integer representing the computational basis state.
-            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-            Note: This function only supports synchronized memory copy.
-        """
-        self._gpu_state.setBasisState(index, use_async)
 
-    def _apply_state_vector(self, state, device_wires, use_async=False):
-        """Initialize the state vector on GPU with a specified state on host.
-        Note that any use of this method will introduce host-overheads.
-        Args:
-        state (array[complex]): normalized input state (on host) of length ``2**len(wires)``
-                or broadcasted state of shape ``(batch_size, 2**len(wires))``
-        device_wires (Wires): wires that get initialized in the state
-        use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-        Note: This function only supports synchronized memory copy from host to device.
-        """
-        # translate to wire labels used by device
-        device_wires = self.map_wires(device_wires)
-
-        state = self._asarray(state, dtype=self.C_DTYPE)  # this operation on host
-        output_shape = [2] * self._num_local_wires
-
-        if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
-            # Initialize the entire device state with the input state
-            if self.num_wires == self._num_local_wires:
-                self.syncH2D(self._reshape(state, output_shape))
-                return
-            local_state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
-            self._mpi_manager.Scatter(state, local_state, 0)
-            # Initialize the entire device state with the input state
-            self.syncH2D(self._reshape(local_state, output_shape))
-            return
-
-        # generate basis states on subset of qubits via the cartesian product
-        basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
-
-        # get basis states to alter on full set of qubits
-        unravelled_indices = np.zeros((2 ** len(device_wires), self.num_wires), dtype=int)
-        unravelled_indices[:, device_wires] = basis_states
-
-        # get indices for which the state is changed to input state vector elements
-        ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires)
-
-        # set the state vector on GPU with the unravelled_indices and their corresponding values
-        self._gpu_state.setStateVector(
-            ravelled_indices, state, use_async
-        )  # this operation on device
-
-    def _apply_basis_state(self, state, wires):
-        """Initialize the state vector in a specified computational basis state on GPU directly.
-            Args:
-            state (array[int]): computational basis state (on host) of shape ``(wires,)``
-                consisting of 0s and 1s.
-            wires (Wires): wires that the provided computational state should be initialized on
-        Note: This function does not support broadcasted inputs yet.
-        """
-        # translate to wire labels used by device
-        device_wires = self.map_wires(wires)
+def _adjoint_ops(op: qml.operation.Operator) -> bool:
+    """Return ``True`` if ``op`` is not a ``PauliRot`` and passes default.qubit's ``adjoint_ops``."""
+    return not isinstance(op, qml.PauliRot) and adjoint_ops(op)
 
-        # length of basis state parameter
-        n_basis_state = len(state)
-        state = state.tolist() if hasattr(state, "tolist") else state
-        if not set(state).issubset({0, 1}):
-            raise ValueError("BasisState parameter must consist of 0 or 1 integers.")
 
-        if n_basis_state != len(device_wires):
-            raise ValueError("BasisState parameter and wires must be of equal length.")
+def _add_adjoint_transforms(program: TransformProgram) -> None:
+    """Private helper function for ``preprocess`` that adds the transforms specific
+    to adjoint differentiation, in the order in which they are listed below.
 
-        # get computational basis state number
-        basis_states = 2 ** (self.num_wires - 1 - np.array(device_wires))
-        basis_states = qml.math.convert_like(basis_states, state)
-        num = int(qml.math.dot(state, basis_states))
+    Args:
+        program (TransformProgram): where we will add the adjoint differentiation transforms
 
-        self._create_basis_state(num)
+    Side Effects:
+        Adds transforms to the input program (it is mutated in place; nothing is returned).
 
-    def apply_lightning(self, operations):
-        """Apply a list of operations to the state tensor.
+    """
 
-        Args:
-            operations (list[~pennylane.operation.Operation]): operations to apply
-            dtype (type): Type of numpy ``complex`` to be used. Can be important
-            to specify for large systems for memory allocation purposes.
+    name = "adjoint + lightning.gpu"  # label passed as ``name`` to the transforms below
+    program.add_transform(no_sampling, name=name)
+    program.add_transform(
+        decompose,
+        stopping_condition=_adjoint_ops,  # adjoint-specific operator filter
+        stopping_condition_shots=stopping_condition_shots,
+        name=name,
+        skip_initial_state_prep=False,
+    )
+    # NOTE(review): uses accepted_observables, not adjoint_observables; confirm intended.
+    program.add_transform(validate_observables, accepted_observables, name=name)
+    program.add_transform(
+        validate_measurements, analytic_measurements=adjoint_measurements, name=name
+    )
+    program.add_transform(qml.transforms.broadcast_expand)
+    program.add_transform(validate_adjoint_trainable_params)
 
-        Returns:
-            array[complex]: the output state tensor
-        """
-        # Skip over identity operations instead of performing
-        # matrix multiplication with the identity.
-        for ops in operations:
-            if isinstance(ops, qml.Identity):
-                continue
-            if isinstance(ops, Adjoint):
-                name = ops.base.name
-                invert_param = True
-            else:
-                name = ops.name
-                invert_param = False
-            method = getattr(self._gpu_state, name, None)
-            wires = self.wires.indices(ops.wires)
-
-            if isinstance(ops, qml.ops.op_math.Controlled) and isinstance(
-                ops.base, qml.GlobalPhase
-            ):
-                controls = ops.control_wires
-                control_values = ops.control_values
-                param = ops.base.parameters[0]
-                matrix = global_phase_diagonal(param, self.wires, controls, control_values)
-                self._gpu_state.apply(name, wires, False, [], matrix)
-            elif method is None:
-                # Inverse can be set to False since qml.matrix(ops) is already in inverted form
-                try:
-                    mat = qml.matrix(ops)
-                except AttributeError:  # pragma: no cover
-                    # To support older versions of PL
-                    mat = ops.matrix
-                r_dtype = np.float32 if self.use_csingle else np.float64
-                param = [[r_dtype(ops.hash)]] if isinstance(ops, gate_cache_needs_hash) else []
-                if len(mat) == 0:
-                    raise ValueError("Unsupported operation")
-                self._gpu_state.apply(
-                    name,
-                    wires,
-                    False,
-                    param,
-                    mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
-                )  # Parameters can be ignored for explicit matrices; F-order for cuQuantum
-
-            else:
-                param = ops.parameters
-                method(wires, invert_param, param)
 
-    # pylint: disable=unused-argument
-    def apply(self, operations, rotations=None, **kwargs):
-        """Applies a list of operations to the state tensor."""
-        # State preparation is currently done in Python
-        if operations:  # make sure operations[0] exists
-            if isinstance(operations[0], StatePrep):
-                self._apply_state_vector(operations[0].parameters[0].copy(), operations[0].wires)
-                operations = operations[1:]
-            elif isinstance(operations[0], BasisState):
-                self._apply_basis_state(operations[0].parameters[0], operations[0].wires)
-                operations = operations[1:]
-
-        for operation in operations:
-            if isinstance(operation, (StatePrep, BasisState)):
-                raise DeviceError(
-                    f"Operation {operation.name} cannot be used after other "
-                    + f"Operations have already been applied on a {self.short_name} device."
-                )
+# LightningGPU specific methods
+def check_gpu_resources() -> None:
+    """Raise ``ImportError`` if cuStateVec is missing, ``ValueError`` if no usable GPU is found."""
+    if find_library("custatevec") is None and not imp_util.find_spec("cuquantum"):
 
-        self.apply_lightning(operations)
+        raise ImportError(
+            "cuStateVec libraries not found. Please pip install the appropriate cuStateVec library in a virtual environment."
+        )
 
-    @staticmethod
-    def _check_adjdiff_supported_operations(operations):
-        """Check Lightning adjoint differentiation method support for a tape.
+    if not DevPool.getTotalDevices():  # the device pool reports zero devices
+        raise ValueError("No supported CUDA-capable device found")
 
-        Raise ``QuantumFunctionError`` if ``tape`` contains not supported measurements,
-        observables, or operations by the Lightning adjoint differentiation method.
+    if not is_gpu_supported():  # the error message below reports get_gpu_arch()
+        raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
 
-        Args:
-            tape (.QuantumTape): quantum tape to differentiate.
-        """
-        for op in operations:
-            if op.num_params > 1 and not isinstance(op, Rot):
-                raise QuantumFunctionError(
-                    f"The {op.name} operation is not supported using "
-                    'the "adjoint" differentiation method'
-                )
 
-    def _init_process_jacobian_tape(self, tape, starting_state, use_device_state):
-        """Generate an initial state vector for ``_process_jacobian_tape``."""
-        if starting_state is not None:
-            if starting_state.size != 2 ** len(self.wires):
-                raise QuantumFunctionError(
-                    "The number of qubits of starting_state must be the same as "
-                    "that of the device."
-                )
-            self._apply_state_vector(starting_state, self.wires)
-        elif not use_device_state:
-            self.reset()
-            self.apply(tape.operations)
-        return self._gpu_state
-
-    # pylint: disable=too-many-branches
-    def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
-        """Implements the adjoint method outlined in
-        `Jones and Gacon <https://arxiv.org/abs/2009.02823>`__ to differentiate an input tape.
-
-        After a forward pass, the circuit is reversed by iteratively applying adjoint
-        gates to scan backwards through the circuit.
-        """
-        if self.shots is not None:
-            warn(
-                "Requested adjoint differentiation to be computed with finite shots."
-                " The derivative is always exact when using the adjoint differentiation method.",
-                UserWarning,
-            )
+@simulator_tracking
+@single_tape_support
+class LightningGPU(LightningBase):
+    """PennyLane Lightning GPU device.
 
-        tape_return_type = self._check_adjdiff_supported_measurements(tape.measurements)
+    A device that interfaces with C++ to perform fast linear algebra calculations.
 
-        if not tape_return_type:  # the tape does not have measurements
-            return np.array([], dtype=self.state.dtype)
+    Use of this device requires pre-built binaries or compilation from source. Check out the
+    :doc:`/lightning_gpu/installation` guide for more details.
 
-        if tape_return_type is State:  # pragma: no cover
-            raise QuantumFunctionError(
-                "Adjoint differentiation method does not support measurement StateMP."
-                "Use vjp method instead for this purpose."
-            )
+    Args:
+        wires (int): the number of wires to initialize the device with
+        c_dtype: Datatypes for statevector representation. Must be one of
+            ``np.complex64`` or ``np.complex128``.
+        shots (int): How many times the circuit should be evaluated (or sampled) to estimate
+            the expectation values. Defaults to ``None`` if not specified. Setting
+            to ``None`` results in computing statistics like expectation values and
+            variances analytically.
+        batch_obs (bool): Determine whether we process observables in parallel when
+            computing the jacobian. This value is only relevant when the lightning.gpu
+            is built with MPI. Default is False.
+        mpi (bool): whether the device uses MPI support.
+        mpi_buf_size (int): size of the GPU memory buffer (in MiB) reserved for MPI operations. The default value is 64 MiB.
+        use_async (bool): whether host-device data copies are performed asynchronously.
+    """
 
-        # Check adjoint diff support
-        self._check_adjdiff_supported_operations(tape.operations)
+    # General device options
+    _device_options = ("c_dtype", "batch_obs")
 
-        if self._mpi:
-            split_obs = False  # with MPI batched means compute Jacobian one observables at a time, no point splitting linear combinations
-        else:
-            split_obs = self._dp.getTotalDevices() if self._batch_obs else False
-        processed_data = self._process_jacobian_tape(
-            tape, starting_state, use_device_state, self._mpi, split_obs
-        )
+    # Device specific options
+    _CPP_BINARY_AVAILABLE = LGPU_CPP_BINARY_AVAILABLE
+    _backend_info = backend_info if LGPU_CPP_BINARY_AVAILABLE else None
 
-        if not processed_data:  # training_params is empty
-            return np.array([], dtype=self.state.dtype)
+    # This `config` is used in Catalyst-Frontend
+    config = Path(__file__).parent / "lightning_gpu.toml"
 
-        trainable_params = processed_data["tp_shift"]
-        # pylint: disable=pointless-string-statement
-        """
-        This path enables controlled batching over the requested observables, be they explicit, or part of a Hamiltonian.
-        The traditional path will assume there exists enough free memory to preallocate all arrays and run through each observable iteratively.
-        However, for larger system, this becomes impossible, and we hit memory issues very quickly. the batching support here enables several functionalities:
-        - Pre-allocate memory for all observables on the primary GPU (`batch_obs=False`, default behaviour): This is the simplest path, and works best for few observables, and moderate qubit sizes. All memory is preallocated for each observable, and run through iteratively on a single GPU.
-        - Evenly distribute the observables over all available GPUs (`batch_obs=True`): This will evenly split the data into ceil(num_obs/num_gpus) chunks, and allocate enough space on each GPU up-front before running through them concurrently. This relies on C++ threads to handle the orchestration.
-        - Allocate at most `n` observables per GPU (`batch_obs=n`): Providing an integer value restricts each available GPU to at most `n` copies of the statevector, and hence `n` given observables for a given batch. This will iterate over the data in chnuks of size `n*num_gpus`.
-        """
-        adjoint_jacobian = _adj_dtype(self.use_csingle, self._mpi)()
-
-        if self._batch_obs:  # Batching of Measurements
-            jac = adjoint_jacobian.batched(
-                self._gpu_state,
-                processed_data["obs_serialized"],
-                processed_data["ops_serialized"],
-                trainable_params,
-            )
-        else:
-            jac = adjoint_jacobian(
-                self._gpu_state,
-                processed_data["obs_serialized"],
-                processed_data["ops_serialized"],
-                trainable_params,
-            )
-        jac = np.array(jac)
-        has_shape0 = bool(len(jac))
+    # TODO: Move supported ops/obs to TOML file
+    operations = _operations
+    # The names of the supported operations.
 
-        num_obs = len(np.unique(processed_data["obs_indices"]))
-        rows = processed_data["obs_indices"]
-        cols = np.arange(len(rows), dtype=int)
-        data = np.ones(len(rows))
-        red_mat = csr_matrix((data, (rows, cols)), shape=(num_obs, len(rows)))
-        jac = red_mat @ jac.reshape((len(rows), -1))
-        jac = jac.reshape(-1, len(trainable_params)) if has_shape0 else jac
-        jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))
-        jac_r[:, processed_data["record_tp_rows"]] = jac
-        return self._adjoint_jacobian_processing(jac_r)
+    observables = _observables
+    # The names of the supported observables.
 
-    # pylint: disable=inconsistent-return-statements, line-too-long, missing-function-docstring
-    def vjp(self, measurements, grad_vec, starting_state=None, use_device_state=False):
-        """Generate the processing function required to compute the vector-Jacobian products
-        of a tape.
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        wires: Union[int, List],
+        *,
+        c_dtype: Union[np.complex128, np.complex64] = np.complex128,
+        shots: Union[int, List] = None,
+        batch_obs: bool = False,
+        # GPU and MPI arguments
+        mpi: bool = False,
+        mpi_buf_size: int = 0,
+        use_async: bool = False,
+    ):
+        if not self._CPP_BINARY_AVAILABLE:
+            raise ImportError(
+                "Pre-compiled binaries for lightning.gpu are not available. "
+                "To manually compile from source, follow the instructions at "
+                "https://docs.pennylane.ai/projects/lightning/en/stable/dev/installation.html."
+            )
 
-        This function can be used with multiple expectation values or a quantum state.
-        When a quantum state is given,
+        check_gpu_resources()
 
-        .. code-block:: python
+        super().__init__(
+            wires=wires,
+            c_dtype=c_dtype,
+            shots=shots,
+            batch_obs=batch_obs,
+        )
 
-            vjp_f = dev.vjp([qml.state()], grad_vec)
-            vjp = vjp_f(tape)
+        # Set the attributes to call the LightningGPU classes
+        self._set_lightning_classes()
 
-        computes :math:`w = (w_1,\\cdots,w_m)` where
+        # GPU specific options
+        self._dp = DevPool()
+        self._use_async = use_async
 
-        .. math::
+        # Creating the state vector
+        self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype)
 
-            w_k = \\langle v| \\frac{\\partial}{\\partial \\theta_k} | \\psi_{\\pmb{\\theta}} \\rangle.
+        self._statevector = self.LightningStateVector(
+            num_wires=len(self.wires),
+            dtype=c_dtype,
+            mpi_handler=self._mpi_handler,
+            use_async=self._use_async,
+        )
 
-        Here, :math:`m` is the total number of trainable parameters,
-        :math:`\\pmb{\\theta}` is the vector of trainable parameters and
-        :math:`\\psi_{\\pmb{\\theta}}` is the output quantum state.
+    @property
+    def name(self):
+        """The name of the device."""
+        return "lightning.gpu"
 
-        Args:
-            measurements (list): List of measurement processes for vector-Jacobian product.
-                Now it must be expectation values or a quantum state.
-            grad_vec (tensor_like): Gradient-output vector. Must have shape matching the output
-                shape of the corresponding tape, i.e. number of measurements if the return
-                type is expectation or :math:`2^N` if the return type is statevector
-            starting_state (tensor_like): post-forward pass state to start execution with.
-                It should be complex-valued. Takes precedence over ``use_device_state``.
-            use_device_state (bool): use current device state to initialize.
-                A forward pass of the same circuit should be the last thing the device
-                has executed. If a ``starting_state`` is provided, that takes precedence.
+    def _set_lightning_classes(self):
+        """Set the LightningStateVector, LightningMeasurements and LightningAdjointJacobian classes as attributes of this instance."""
+        self.LightningStateVector = LightningGPUStateVector
+        self.LightningMeasurements = LightningGPUMeasurements
+        self.LightningAdjointJacobian = LightningGPUAdjointJacobian
 
-        Returns:
-            The processing function required to compute the vector-Jacobian products of a tape.
+    def _setup_execution_config(self, config):
         """
-        if self.shots is not None:
-            warn(
-                "Requested adjoint differentiation to be computed with finite shots."
-                " The derivative is always exact when using the adjoint differentiation method.",
-                UserWarning,
-            )
-
-        tape_return_type = self._check_adjdiff_supported_measurements(measurements)
-
-        if math.allclose(grad_vec, 0) or tape_return_type is None:
-            return lambda tape: math.convert_like(np.zeros(len(tape.trainable_params)), grad_vec)
+        Update the execution config with choices for how the device should be used and the device options.
+        """
+        updated_values = {}
+        if config.gradient_method == "best":
+            updated_values["gradient_method"] = "adjoint"
+        if config.use_device_gradient is None:
+            updated_values["use_device_gradient"] = config.gradient_method in ("best", "adjoint")
+        if config.grad_on_execution is None:
+            updated_values["grad_on_execution"] = True
 
-        if tape_return_type is Expectation:
-            if len(grad_vec) != len(measurements):
-                raise ValueError(
-                    "Number of observables in the tape must be the same as the length of grad_vec in the vjp method"
-                )
+        new_device_options = dict(config.device_options)
+        for option in self._device_options:
+            if option not in new_device_options:
+                new_device_options[option] = getattr(self, f"_{option}", None)
 
-            if np.iscomplexobj(grad_vec):
-                raise ValueError(
-                    "The vjp method only works with a real-valued grad_vec when the tape is returning an expectation value"
-                )
+        # The default mcmc configuration must be set to satisfy the requirements of ExecutionConfig
+        mcmc_default = {"mcmc": False, "kernel_name": None, "num_burnin": 0, "rng": None}
+        new_device_options.update(mcmc_default)
 
-            ham = qml.Hamiltonian(grad_vec, [m.obs for m in measurements])
+        return replace(config, **updated_values, device_options=new_device_options)
 
-            # pylint: disable=protected-access
-            def processing_fn(tape):
-                nonlocal ham
-                num_params = len(tape.trainable_params)
+    def preprocess(self, execution_config: ExecutionConfig = DefaultExecutionConfig):
+        """This function defines the device transform program to be applied and an updated device configuration.
 
-                if num_params == 0:
-                    return np.array([], dtype=self.state.dtype)
+        Args:
+            execution_config (Union[ExecutionConfig, Sequence[ExecutionConfig]]): A data structure describing the
+                parameters needed to fully describe the execution.
 
-                new_tape = tape.copy()
-                new_tape._measurements = [qml.expval(ham)]
+        Returns:
+            TransformProgram, ExecutionConfig: A transform program that when called returns :class:`~.QuantumTape`'s that the
+            device can natively execute as well as a postprocessing function to be called after execution, and a configuration
+            with unset specifications filled in.
 
-                return self.adjoint_jacobian(new_tape, starting_state, use_device_state)
+        This device:
 
-            return processing_fn
+        * Supports any qubit operations that provide a matrix
+        * Currently does not support finite shots
+        * Currently does not intrinsically support parameter broadcasting
 
-    # pylint: disable=attribute-defined-outside-init
-    def sample(self, observable, shot_range=None, bin_size=None, counts=False):
-        """Return samples of an observable."""
-        diagonalizing_gates = observable.diagonalizing_gates()
-        if diagonalizing_gates:
-            self.apply(diagonalizing_gates)
-        if not isinstance(observable, qml.PauliZ):
-            self._samples = self.generate_samples()
-        results = super().sample(
-            observable, shot_range=shot_range, bin_size=bin_size, counts=counts
+        """
+        exec_config = self._setup_execution_config(execution_config)
+        program = TransformProgram()
+
+        program.add_transform(validate_measurements, name=self.name)
+        program.add_transform(validate_observables, accepted_observables, name=self.name)
+        program.add_transform(validate_device_wires, self.wires, name=self.name)
+        program.add_transform(
+            mid_circuit_measurements, device=self, mcm_config=exec_config.mcm_config
         )
-        if diagonalizing_gates:
-            self.apply([qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)])
-        return results
 
-    def generate_samples(self):
-        """Generate samples
-
-        Returns:
-            array[int]: array of samples in binary representation with shape
-            ``(dev.shots, dev.num_wires)``
-        """
-        shots = self.shots if isinstance(self.shots, int) else self.shots.total_shots
+        program.add_transform(
+            decompose,
+            stopping_condition=stopping_condition,
+            stopping_condition_shots=stopping_condition_shots,
+            skip_initial_state_prep=True,
+            name=self.name,
+        )
+        program.add_transform(qml.transforms.broadcast_expand)
 
-        return self.measurements.generate_samples(len(self.wires), shots).astype(int, copy=False)
+        if exec_config.gradient_method == "adjoint":
+            _add_adjoint_transforms(program)
+        return program, exec_config
 
-    # pylint: disable=protected-access
-    def expval(self, observable, shot_range=None, bin_size=None):
-        """Expectation value of the supplied observable.
+    # pylint: disable=unused-argument
+    def execute(
+        self,
+        circuits: QuantumTape_or_Batch,
+        execution_config: ExecutionConfig = DefaultExecutionConfig,
+    ) -> Result_or_ResultBatch:
+        """Execute a circuit or a batch of circuits and turn it into results.
 
         Args:
-            observable: A PennyLane observable.
-            shot_range (tuple[int]): 2-tuple of integers specifying the range of samples
-                to use. If not specified, all samples are used.
-            bin_size (int): Divides the shot range into bins of size ``bin_size``, and
-                returns the measurement statistic separately over each bin. If not
-                provided, the entire shot range is treated as a single bin.
+            circuits (Union[QuantumTape, Sequence[QuantumTape]]): the quantum circuits to be executed
+            execution_config (ExecutionConfig): a data structure with additional information required for execution
 
         Returns:
-            Expectation value of the observable
+            TensorLike, tuple[TensorLike], tuple[tuple[TensorLike]]: A numeric result of the computation.
         """
-        if isinstance(observable, qml.Projector):
-            diagonalizing_gates = observable.diagonalizing_gates()
-            if self.shots is None and diagonalizing_gates:
-                self.apply(diagonalizing_gates)
-            results = super().expval(observable, shot_range=shot_range, bin_size=bin_size)
-            if self.shots is None and diagonalizing_gates:
-                self.apply([qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)])
-            return results
-
-        if self.shots is not None:
-            # estimate the expectation value
-            samples = self.sample(observable, shot_range=shot_range, bin_size=bin_size)
-            return np.squeeze(np.mean(samples, axis=0))
-
-        if isinstance(observable, qml.SparseHamiltonian):
-            if self._mpi:
-                # Identity for CSR_SparseHamiltonian to pass to processes with rank != 0 to reduce
-                # host(cpu) memory requirements
-                obs = qml.Identity(0)
-                Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix()
-                H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1))
-                CSR_SparseHamiltonian = H_sparse.sparse_matrix().tocsr()
-                # CSR_SparseHamiltonian for rank == 0
-                if self._mpi_manager.getRank() == 0:
-                    CSR_SparseHamiltonian = observable.sparse_matrix().tocsr()
-            else:
-                CSR_SparseHamiltonian = observable.sparse_matrix().tocsr()
-
-            return self.measurements.expval(
-                CSR_SparseHamiltonian.indptr,
-                CSR_SparseHamiltonian.indices,
-                CSR_SparseHamiltonian.data,
-            )
-
-        # use specialized functors to compute expval(Hermitian)
-        if isinstance(observable, qml.Hermitian):
-            observable_wires = self.map_wires(observable.wires)
-            if self._mpi and len(observable_wires) > self._num_local_wires:
-                raise RuntimeError(
-                    "MPI backend does not support Hermitian with number of target wires larger than local wire number."
+        results = []
+        for circuit in circuits:
+            if self._wire_map is not None:
+                [circuit], _ = qml.map_wires(circuit, self._wire_map)
+            results.append(
+                self.simulate(
+                    circuit,
+                    self._statevector,
+                    postselect_mode=execution_config.mcm_config.postselect_mode,
                 )
-            matrix = observable.matrix()
-            return self.measurements.expval(matrix, observable_wires)
-
-        if (
-            isinstance(observable, qml.ops.Hamiltonian)
-            or (observable.arithmetic_depth > 0)
-            or isinstance(observable.name, List)
-        ):
-            ob_serialized = QuantumScriptSerializer(
-                self.short_name, self.use_csingle, self._mpi
-            )._ob(observable, self.wire_map)
-            return self.measurements.expval(ob_serialized)
+            )
 
-        # translate to wire labels used by device
-        observable_wires = self.map_wires(observable.wires)
+        return tuple(results)
 
-        return self.measurements.expval(observable.name, observable_wires)
+    def supports_derivatives(
+        self,
+        execution_config: Optional[ExecutionConfig] = None,
+        circuit: Optional[qml.tape.QuantumTape] = None,
+    ) -> bool:
+        """Check whether or not derivatives are available for a given configuration and circuit.
 
-    def probability_lightning(self, wires=None):
-        """Return the probability of each computational basis state.
+        ``LightningGPU`` supports adjoint differentiation with analytic results.
 
         Args:
-            wires (Iterable[Number, str], Number, str, Wires): wires to return
-                marginal probabilities for. Wires not provided are traced out of the system.
+            execution_config (ExecutionConfig): The configuration of the desired derivative calculation
+            circuit (QuantumTape): An optional circuit to check derivatives support for.
 
         Returns:
-            array[float]: list of the probabilities
+            bool: Whether or not a derivative can be calculated given the provided information
+
         """
-        # translate to wire labels used by device
-        observable_wires = self.map_wires(wires)
-        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
-        local_prob = self.measurements.probs(observable_wires)
-        if len(local_prob) > 0:
-            num_local_wires = len(local_prob).bit_length() - 1 if len(local_prob) > 0 else 0
-            return local_prob.reshape([2] * num_local_wires).transpose().reshape(-1)
-        return local_prob
-
-    def var(self, observable, shot_range=None, bin_size=None):
-        """Variance of the supplied observable.
+        if execution_config is None and circuit is None:
+            return True
+        if execution_config.gradient_method not in {"adjoint", "best"}:
+            return False
+        if circuit is None:
+            return True
+        return _supports_adjoint(circuit=circuit)
+
+    def simulate(
+        self,
+        circuit: QuantumScript,
+        state: LightningGPUStateVector,
+        postselect_mode: Optional[str] = None,
+    ) -> Result:
+        """Simulate a single quantum script.
 
         Args:
-            observable: A PennyLane observable.
-            shot_range (tuple[int]): 2-tuple of integers specifying the range of samples
-                to use. If not specified, all samples are used.
-            bin_size (int): Divides the shot range into bins of size ``bin_size``, and
-                returns the measurement statistic separately over each bin. If not
-                provided, the entire shot range is treated as a single bin.
+            circuit (QuantumScript): The single circuit to simulate
+            state (LightningGPUStateVector): handle to Lightning state vector
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots. Default is ``None``.
 
         Returns:
-            Variance of the observable
-        """
-        if isinstance(observable, qml.Projector):
-            diagonalizing_gates = observable.diagonalizing_gates()
-            if self.shots is None and diagonalizing_gates:
-                self.apply(diagonalizing_gates)
-            results = super().var(observable, shot_range=shot_range, bin_size=bin_size)
-            if self.shots is None and diagonalizing_gates:
-                self.apply([qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)])
-            return results
-
-        if self.shots is not None:
-            # estimate the var
-            # Lightning doesn't support sampling yet
-            samples = self.sample(observable, shot_range=shot_range, bin_size=bin_size)
-            return np.squeeze(np.var(samples, axis=0))
-
-        if isinstance(observable, qml.SparseHamiltonian):
-            csr_hamiltonian = observable.sparse_matrix(wire_order=self.wires).tocsr(copy=False)
-            return self.measurements.var(
-                csr_hamiltonian.indptr,
-                csr_hamiltonian.indices,
-                csr_hamiltonian.data,
-            )
+            Tuple[TensorLike]: The results of the simulation
 
-        if (
-            isinstance(observable, (qml.Hermitian, qml.ops.Hamiltonian))
-            or (observable.arithmetic_depth > 0)
-            or isinstance(observable.name, List)
-        ):
-            ob_serialized = QuantumScriptSerializer(
-                self.short_name, self.use_csingle, self._mpi
-            )._ob(observable, self.wire_map)
-            return self.measurements.var(ob_serialized)
+        Note that this function can return measurements for non-commuting observables simultaneously.
+        """
+        if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
+            if self._mpi_handler.use_mpi:
+                raise qml.DeviceError(
+                    "Lightning-GPU-MPI does not support Mid-circuit measurements."
+                )
 
-        # translate to wire labels used by device
-        observable_wires = self.map_wires(observable.wires)
+            results = []
+            aux_circ = QuantumScript(
+                circuit.operations,
+                circuit.measurements,
+                shots=[1],
+                trainable_params=circuit.trainable_params,
+            )
+            for _ in range(circuit.shots.total_shots):
+                state.reset_state()
+                mid_measurements = {}
+                final_state = state.get_final_state(
+                    aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode
+                )
+                results.append(
+                    self.LightningMeasurements(final_state).measure_final_state(
+                        aux_circ, mid_measurements=mid_measurements
+                    )
+                )
+            return tuple(results)
 
-        return self.measurements.var(observable.name, observable_wires)
+        state.reset_state()
+        final_state = state.get_final_state(circuit)
+        return self.LightningMeasurements(final_state).measure_final_state(circuit)
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.toml b/pennylane_lightning/lightning_gpu/lightning_gpu.toml
index 518315de09..b18470da6b 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.toml
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.toml
@@ -98,7 +98,7 @@ qjit_compatible = false
 # If the device requires run time generation of the quantum circuit.
 runtime_code_generation = false
 # If the device supports mid circuit measurements natively
-mid_circuit_measurement = false
+mid_circuit_measurement = true
 
 # This field is currently unchecked but it is reserved for the purpose of
 # determining if the device supports dynamic qubit allocation/deallocation.
diff --git a/pennylane_lightning/lightning_kokkos/_adjoint_jacobian.py b/pennylane_lightning/lightning_kokkos/_adjoint_jacobian.py
index 4338a5b876..bee481aac4 100644
--- a/pennylane_lightning/lightning_kokkos/_adjoint_jacobian.py
+++ b/pennylane_lightning/lightning_kokkos/_adjoint_jacobian.py
@@ -15,6 +15,10 @@
 Internal methods for adjoint Jacobian differentiation method.
 """
 
+from __future__ import annotations
+
+from warnings import warn
+
 try:
     from pennylane_lightning.lightning_kokkos_ops.algorithms import (
         AdjointJacobianC64,
@@ -22,8 +26,8 @@
         create_ops_listC64,
         create_ops_listC128,
     )
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 import numpy as np
 from pennylane.tape import QuantumTape
@@ -31,8 +35,6 @@
 # pylint: disable=ungrouped-imports
 from pennylane_lightning.core._adjoint_jacobian_base import LightningBaseAdjointJacobian
 
-from ._state_vector import LightningKokkosStateVector
-
 
 class LightningKokkosAdjointJacobian(LightningBaseAdjointJacobian):
     """Check and execute the adjoint Jacobian differentiation method.
@@ -44,7 +46,11 @@ class LightningKokkosAdjointJacobian(LightningBaseAdjointJacobian):
 
     # pylint: disable=too-few-public-methods
 
-    def __init__(self, qubit_state: LightningKokkosStateVector, batch_obs: bool = False) -> None:
+    def __init__(
+        self,
+        qubit_state: LightningKokkosStateVector,  # pylint: disable=undefined-variable
+        batch_obs: bool = False,
+    ) -> None:
         super().__init__(qubit_state, batch_obs)
 
         # Initialize the C++ binds
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index b438af350c..ee848739cf 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -15,11 +15,14 @@
 Class implementation for state vector measurements.
 """
 
-# pylint: disable=import-error, no-name-in-module, ungrouped-imports
+from __future__ import annotations
+
+from warnings import warn
+
 try:
     from pennylane_lightning.lightning_kokkos_ops import MeasurementsC64, MeasurementsC128
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 from typing import List
 
@@ -28,6 +31,7 @@
 from pennylane.measurements import CountsMP, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
+# pylint: disable=ungrouped-imports
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
 
 
@@ -44,7 +48,7 @@ class LightningKokkosMeasurements(
 
     def __init__(
         self,
-        kokkos_state,
+        kokkos_state: LightningKokkosStateVector,  # pylint: disable=undefined-variable
     ) -> None:
         super().__init__(kokkos_state)
 
diff --git a/pennylane_lightning/lightning_kokkos/_state_vector.py b/pennylane_lightning/lightning_kokkos/_state_vector.py
index 50518ed078..cd8d23ceef 100644
--- a/pennylane_lightning/lightning_kokkos/_state_vector.py
+++ b/pennylane_lightning/lightning_kokkos/_state_vector.py
@@ -14,6 +14,7 @@
 """
 Class implementation for lightning_kokkos state-vector manipulation.
 """
+from warnings import warn
 
 try:
     from pennylane_lightning.lightning_kokkos_ops import (
@@ -23,8 +24,10 @@
         allocate_aligned_array,
         print_configuration,
     )
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+from typing import Union
 
 import numpy as np
 import pennylane as qml
@@ -58,17 +61,16 @@ class LightningKokkosStateVector(LightningBaseStateVector):
 
     def __init__(
         self,
-        num_wires,
-        dtype=np.complex128,
+        num_wires: int,
+        dtype: Union[np.complex128, np.complex64] = np.complex128,
         kokkos_args=None,
-        sync=True,
-    ):  # pylint: disable=too-many-arguments
+    ):
+
         super().__init__(num_wires, dtype)
 
         self._device_name = "lightning.kokkos"
 
         self._kokkos_config = {}
-        self._sync = sync
 
         # Initialize the state vector
         if kokkos_args is None:
@@ -142,7 +144,7 @@ def sync_d2h(self, state_vector):
 
         >>> dev = qml.device('lightning.kokkos', wires=1)
         >>> dev.apply([qml.PauliX(wires=[0])])
-        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.C_DTYPE)
+        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.c_dtype)
         >>> dev.sync_d2h(state_vector)
         >>> print(state_vector)
         [0.+0.j 1.+0.j]
@@ -277,9 +279,12 @@ def _apply_lightning(
                 )
             elif isinstance(operation, qml.PauliRot):
                 method = getattr(state, "applyPauliRot")
-                paulis = operation._hyperparameters["pauli_word"]
+                # pylint: disable=protected-access
+                paulis = operation._hyperparameters[
+                    "pauli_word"
+                ]  # pylint: disable=protected-access
                 wires = [i for i, w in zip(wires, paulis) if w != "I"]
-                word = "".join(p for p in paulis if p != "I")  # pylint: disable=protected-access
+                word = "".join(p for p in paulis if p != "I")
                 method(wires, invert_param, operation.parameters, word)
             elif method is not None:  # apply specialized gate
                 param = operation.parameters
diff --git a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
index 668550ff24..faa7e6d0bf 100644
--- a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
+++ b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
@@ -20,7 +20,7 @@
 from dataclasses import replace
 from functools import reduce
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union
 from warnings import warn
 
 import numpy as np
@@ -50,10 +50,6 @@
     Result_or_ResultBatch,
 )
 
-from ._adjoint_jacobian import LightningKokkosAdjointJacobian
-from ._measurements import LightningKokkosMeasurements
-from ._state_vector import LightningKokkosStateVector
-
 try:
     from pennylane_lightning.lightning_kokkos_ops import backend_info, print_configuration
 
@@ -63,6 +59,10 @@
     LK_CPP_BINARY_AVAILABLE = False
     backend_info = None
 
+from ._adjoint_jacobian import LightningKokkosAdjointJacobian
+from ._measurements import LightningKokkosMeasurements
+from ._state_vector import LightningKokkosStateVector
+
 # The set of supported operations.
 _operations = frozenset(
     {
@@ -313,13 +313,12 @@ class LightningKokkos(LightningBase):
 
     def __init__(  # pylint: disable=too-many-arguments
         self,
-        wires,
+        wires: Union[int, List],
         *,
-        c_dtype=np.complex128,
-        shots=None,
-        batch_obs=False,
+        c_dtype: Union[np.complex128, np.complex64] = np.complex128,
+        shots: Optional[Union[int, List]] = None,
+        batch_obs: bool = False,
         # Kokkos arguments
-        sync=True,
         kokkos_args=None,
     ):
         if not self._CPP_BINARY_AVAILABLE:
@@ -341,11 +340,10 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # Kokkos specific options
         self._kokkos_args = kokkos_args
-        self._sync = sync
 
         # Creating the state vector
         self._statevector = self.LightningStateVector(
-            num_wires=len(self.wires), dtype=c_dtype, kokkos_args=kokkos_args, sync=sync
+            num_wires=len(self.wires), dtype=c_dtype, kokkos_args=kokkos_args
         )
 
         if not LightningKokkos.kokkos_config:
@@ -516,7 +514,7 @@ def simulate(
                     aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode
                 )
                 results.append(
-                    LightningKokkosMeasurements(final_state).measure_final_state(
+                    self.LightningMeasurements(final_state).measure_final_state(
                         aux_circ, mid_measurements=mid_measurements
                     )
                 )
@@ -524,7 +522,7 @@ def simulate(
 
         state.reset_state()
         final_state = state.get_final_state(circuit)
-        return LightningKokkosMeasurements(final_state).measure_final_state(circuit)
+        return self.LightningMeasurements(final_state).measure_final_state(circuit)
 
     @staticmethod
     def get_c_interface():
diff --git a/pennylane_lightning/lightning_qubit/_adjoint_jacobian.py b/pennylane_lightning/lightning_qubit/_adjoint_jacobian.py
index 0abc7f72f7..390c0cf69b 100644
--- a/pennylane_lightning/lightning_qubit/_adjoint_jacobian.py
+++ b/pennylane_lightning/lightning_qubit/_adjoint_jacobian.py
@@ -14,6 +14,9 @@
 r"""
 Internal methods for adjoint Jacobian differentiation method.
 """
+from __future__ import annotations
+
+from warnings import warn
 
 try:
     from pennylane_lightning.lightning_qubit_ops.algorithms import (
@@ -22,8 +25,8 @@
         create_ops_listC64,
         create_ops_listC128,
     )
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 from os import getenv
 
@@ -34,8 +37,6 @@
 # pylint: disable=ungrouped-imports
 from pennylane_lightning.core._adjoint_jacobian_base import LightningBaseAdjointJacobian
 
-from ._state_vector import LightningStateVector
-
 
 class LightningAdjointJacobian(
     LightningBaseAdjointJacobian
@@ -47,7 +48,12 @@ class LightningAdjointJacobian(
         batch_obs(bool): If serialized tape is to be batched or not.
     """
 
-    def __init__(self, qubit_state: LightningStateVector, batch_obs: bool = False) -> None:
+    def __init__(
+        self,
+        qubit_state: LightningStateVector,  # pylint: disable=undefined-variable
+        batch_obs: bool = False,
+    ) -> None:
+
         super().__init__(qubit_state, batch_obs)
 
         # Initialize the C++ binds
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index c1b97a1184..415ce74088 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -16,10 +16,14 @@
 """
 
 # pylint: disable=import-error, no-name-in-module, ungrouped-imports
+from __future__ import annotations
+
+from warnings import warn
+
 try:
     from pennylane_lightning.lightning_qubit_ops import MeasurementsC64, MeasurementsC128
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 from functools import reduce
 from typing import List
@@ -53,7 +57,7 @@ class LightningMeasurements(LightningBaseMeasurements):  # pylint: disable=too-f
 
     def __init__(
         self,
-        qubit_state,
+        qubit_state: LightningStateVector,  # pylint: disable=undefined-variable
         mcmc: bool = None,
         kernel_name: str = None,
         num_burnin: int = None,
diff --git a/pennylane_lightning/lightning_qubit/_state_vector.py b/pennylane_lightning/lightning_qubit/_state_vector.py
index b4b6ef5ff1..62068dcbd7 100644
--- a/pennylane_lightning/lightning_qubit/_state_vector.py
+++ b/pennylane_lightning/lightning_qubit/_state_vector.py
@@ -14,6 +14,7 @@
 """
 Class implementation for lightning_qubit state-vector manipulation.
 """
+from warnings import warn
 
 try:
     from pennylane_lightning.lightning_qubit_ops import (
@@ -21,8 +22,10 @@
         StateVectorC128,
         allocate_aligned_array,
     )
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+from typing import Union
 
 import numpy as np
 import pennylane as qml
@@ -50,7 +53,8 @@ class LightningStateVector(LightningBaseStateVector):  # pylint: disable=too-few
         device_name(string): state vector device name. Options: ["lightning.qubit"]
     """
 
-    def __init__(self, num_wires, dtype=np.complex128):
+    def __init__(self, num_wires: int, dtype: Union[np.complex128, np.complex64] = np.complex128):
+
         super().__init__(num_wires, dtype)
 
         self._device_name = "lightning.qubit"
diff --git a/pennylane_lightning/lightning_qubit/lightning_qubit.py b/pennylane_lightning/lightning_qubit/lightning_qubit.py
index c317bbfbad..abf0809787 100644
--- a/pennylane_lightning/lightning_qubit/lightning_qubit.py
+++ b/pennylane_lightning/lightning_qubit/lightning_qubit.py
@@ -18,7 +18,7 @@
 from dataclasses import replace
 from functools import reduce
 from pathlib import Path
-from typing import Optional, Sequence
+from typing import List, Optional, Sequence, Union
 from warnings import warn
 
 import numpy as np
@@ -48,10 +48,6 @@
     Result_or_ResultBatch,
 )
 
-from ._adjoint_jacobian import LightningAdjointJacobian
-from ._measurements import LightningMeasurements
-from ._state_vector import LightningStateVector
-
 try:
     from pennylane_lightning.lightning_qubit_ops import backend_info
 
@@ -60,6 +56,10 @@
     warn(str(ex), UserWarning)
     LQ_CPP_BINARY_AVAILABLE = False
 
+from ._adjoint_jacobian import LightningAdjointJacobian
+from ._measurements import LightningMeasurements
+from ._state_vector import LightningStateVector
+
 # The set of supported operations.
 _operations = frozenset(
     {
@@ -323,16 +323,16 @@ class LightningQubit(LightningBase):
 
     def __init__(  # pylint: disable=too-many-arguments
         self,
-        wires,
+        wires: Union[int, List],
         *,
-        c_dtype=np.complex128,
-        shots=None,
-        batch_obs=False,
+        c_dtype: Union[np.complex128, np.complex64] = np.complex128,
+        shots: Optional[Union[int, List]] = None,
+        batch_obs: bool = False,
         # Markov Chain Monte Carlo (MCMC) sampling method arguments
-        seed="global",
-        mcmc=False,
-        kernel_name="Local",
-        num_burnin=100,
+        seed: Union[str, int] = "global",
+        mcmc: bool = False,
+        kernel_name: str = "Local",
+        num_burnin: int = 100,
     ):
         if not self._CPP_BINARY_AVAILABLE:
             raise ImportError(
@@ -559,4 +559,4 @@ def simulate(
 
         state.reset_state()
         final_state = state.get_final_state(circuit)
-        return LightningMeasurements(final_state, **mcmc).measure_final_state(circuit)
+        return self.LightningMeasurements(final_state, **mcmc).measure_final_state(circuit)
diff --git a/pennylane_lightning/lightning_tensor/_tensornet.py b/pennylane_lightning/lightning_tensor/_tensornet.py
index 05849ad4bb..967c0fbb17 100644
--- a/pennylane_lightning/lightning_tensor/_tensornet.py
+++ b/pennylane_lightning/lightning_tensor/_tensornet.py
@@ -21,8 +21,6 @@
 except ImportError:
     pass
 
-from itertools import product
-
 import numpy as np
 import pennylane as qml
 from pennylane import BasisState, DeviceError, StatePrep
@@ -223,20 +221,46 @@ def _preprocess_state_vector(self, state, device_wires):
         if len(device_wires) == self._num_wires and Wires(sorted(device_wires)) == device_wires:
             return np.reshape(state, output_shape).ravel(order="C")
 
-        # generate basis states on subset of qubits via the cartesian product
-        basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
+        local_dev_wires = device_wires.tolist().copy()
+        local_dev_wires = local_dev_wires[::-1]
+
+        # Build basis states for the wire subset by broadcasting instead of a Cartesian product.
+
+        # Allocate a single row as a base to avoid a large array allocation with
+        # the cartesian product algorithm.
+        # Initialize the base with the pattern [0 1 0 1 ...].
+        base = np.tile([0, 1], 2 ** (len(local_dev_wires) - 1)).astype(dtype=np.int64)
+        # Allocate the array that accumulates, for each basis state, its flat index in the
+        # full state vector.
+        indexes = np.zeros(2 ** (len(local_dev_wires)), dtype=np.int64)
+
+        max_dev_wire = self._num_wires - 1
+
+        # Iterate over all device wires.
+        for i, wire in enumerate(local_dev_wires):
+
+            # Accumulate indexes from the basis.
+            indexes += base * 2 ** (max_dev_wire - wire)
+
+            if i == len(local_dev_wires) - 1:
+                continue
+
+            two_n = 2 ** (i + 1)  # Compute the value of the base.
 
-        # get basis states to alter on full set of qubits
-        unravelled_indices = np.zeros((2 ** len(device_wires), self._num_wires), dtype=int)
-        unravelled_indices[:, device_wires] = basis_states
+            # Update the value of the base without reallocating a new array.
+            # Reshape the basis to swap the internal columns.
+            base = base.reshape(-1, two_n * 2)
+            swapper_A = two_n // 2
+            swapper_B = swapper_A + two_n
 
-        # get indices for which the state is changed to input state vector elements
-        ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self._num_wires)
+            base[:, swapper_A:swapper_B] = base[:, swapper_A:swapper_B][:, ::-1]
+            # Flatten the base array
+            base = base.reshape(-1)
 
         # get full state vector to be factorized into MPS
         full_state = np.zeros(2**self._num_wires, dtype=self.dtype)
         for i, value in enumerate(state):
-            full_state[ravelled_indices[i]] = value
+            full_state[indexes[i]] = value
         return np.reshape(full_state, output_shape).ravel(order="C")
 
     def _apply_state_vector(self, state, device_wires: Wires):
@@ -285,7 +309,7 @@ def _apply_MPO(self, gate_matrix, wires):
             None
         """
         # TODO: Discuss if public interface for max_mpo_bond_dim argument
-        max_mpo_bond_dim = 2 ** len(wires)  # Exact SVD decomposition for MPO
+        max_mpo_bond_dim = self._max_bond_dim
 
         # Get sorted wires and MPO site tensor
         mpos, sorted_wires = gate_matrix_decompose(
diff --git a/setup.py b/setup.py
index a326a90a2d..7e6e080e96 100644
--- a/setup.py
+++ b/setup.py
@@ -156,6 +156,12 @@ def build_extension(self, ext: CMakeExtension):
             env=os.environ,
         )
 
+        # Copy the Catalyst shared object into ./build so pip editable installs can find it.
+        if backend in ("lightning_kokkos",):  # one-element tuple: membership, not substring match
+            source = os.path.join(f"{extdir}", f"lib{backend}_catalyst.so")
+            destination = os.path.join(os.getcwd(), "build")
+            shutil.copy(source, destination)
+
 with open(os.path.join("pennylane_lightning", "core", "_version.py"), encoding="utf-8") as f:
     version = f.readlines()[-1].split()[-1].strip("\"'")
 
diff --git a/tests/conftest.py b/tests/conftest.py
index a648418465..1c06ae0dc4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -142,10 +142,15 @@ def get_device():
         from pennylane_lightning.lightning_kokkos_ops import LightningException
 elif device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
-
-    LightningAdjointJacobian = None
-    LightningMeasurements = None
-    LightningStateVector = None
+    from pennylane_lightning.lightning_gpu._adjoint_jacobian import (
+        LightningGPUAdjointJacobian as LightningAdjointJacobian,
+    )
+    from pennylane_lightning.lightning_gpu._measurements import (
+        LightningGPUMeasurements as LightningMeasurements,
+    )
+    from pennylane_lightning.lightning_gpu._state_vector import (
+        LightningGPUStateVector as LightningStateVector,
+    )
 
     if hasattr(pennylane_lightning, "lightning_gpu_ops"):
         import pennylane_lightning.lightning_gpu_ops as lightning_ops
diff --git a/tests/lightning_qubit/test_adjoint_jacobian_class.py b/tests/lightning_qubit/test_adjoint_jacobian_class.py
index 6a6c5b8e92..74fc8e2427 100644
--- a/tests/lightning_qubit/test_adjoint_jacobian_class.py
+++ b/tests/lightning_qubit/test_adjoint_jacobian_class.py
@@ -481,6 +481,7 @@ def test_hermitian_expectation(self, tol, lightning_sv):
             tape.trainable_params = {0}
 
             statevector.reset_state()
+
             vjp = self.calculate_vjp(statevector, tape, dy)
 
             assert np.allclose(vjp, -0.8 * np.sin(x), atol=tol)
@@ -498,6 +499,7 @@ def test_hermitian_tensor_expectation(self, tol, lightning_sv):
             tape.trainable_params = {0}
 
             statevector.reset_state()
+
             vjp = self.calculate_vjp(statevector, tape, dy)
 
             assert np.allclose(vjp, -0.8 * np.sin(x), atol=tol)
diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index c1b6fb44d5..5ddb6a5f5b 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -669,8 +669,8 @@ def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_s
             assert np.allclose(r, e, atol=dtol, rtol=dtol)
 
     @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not support out of order probs.",
+        device_name in ("lightning.tensor",),
+        reason=f"{device_name} does not support out of order probs.",
     )
     @pytest.mark.parametrize(
         "cases",
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index 3918afcd5f..b3baaa3ea6 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -30,6 +30,9 @@
     except ImportError:
         pass
 
+if device_name == "lightning.gpu":
+    from pennylane_lightning.lightning_gpu._mpi_handler import MPIHandler
+
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 
@@ -39,6 +42,7 @@
         allow_module_level=True,
     )
 
+
 if not LightningDevice._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
@@ -86,10 +90,18 @@ def test_apply_state_vector_with_lightning_handle(tol):
     state_vector_1 = LightningStateVector(2)
     state_vector_1.apply_operations([qml.BasisState(np.array([0, 1]), wires=[0, 1])])
 
-    state_vector_2 = LightningStateVector(2)
-    state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
+    if device_name == "lightning.gpu":
+        with pytest.raises(
+            qml.DeviceError, match="LightningGPU does not support allocate external state_vector."
+        ):
+            state_vector_2 = LightningStateVector(2)
+            state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
+
+    else:
+        state_vector_2 = LightningStateVector(2)
+        state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
 
-    assert np.allclose(state_vector_1.state, state_vector_2.state, atol=tol, rtol=0)
+        assert np.allclose(state_vector_1.state, state_vector_2.state, atol=tol, rtol=0)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/new_api/test_device.py b/tests/new_api/test_device.py
index 0485f3a054..111dd3af7d 100644
--- a/tests/new_api/test_device.py
+++ b/tests/new_api/test_device.py
@@ -43,8 +43,7 @@
         validate_measurements,
         validate_observables,
     )
-
-if device_name == "lightning.kokkos":
+elif device_name == "lightning.kokkos":
     from pennylane_lightning.lightning_kokkos.lightning_kokkos import (
         _add_adjoint_transforms,
         _adjoint_ops,
@@ -62,13 +61,31 @@
         validate_measurements,
         validate_observables,
     )
-
-
-if device_name == "lightning.tensor":
+elif device_name == "lightning.gpu":
+    from pennylane_lightning.lightning_gpu.lightning_gpu import (
+        _add_adjoint_transforms,
+        _adjoint_ops,
+        _supports_adjoint,
+        accepted_observables,
+        adjoint_measurements,
+        adjoint_observables,
+        decompose,
+        mid_circuit_measurements,
+        no_sampling,
+        stopping_condition,
+        stopping_condition_shots,
+        validate_adjoint_trainable_params,
+        validate_device_wires,
+        validate_measurements,
+        validate_observables,
+    )
+elif device_name == "lightning.tensor":
     from pennylane_lightning.lightning_tensor.lightning_tensor import (
         accepted_observables,
         stopping_condition,
     )
+else:
+    raise TypeError(f"The device name: {device_name} is not a valid name")
 
 if not LightningDevice._new_API:
     pytest.skip("Exclusive tests for new device API. Skipping.", allow_module_level=True)
@@ -448,6 +465,11 @@ def test_execute_single_measurement(self, theta, phi, mp, dev):
         if isinstance(mp.obs, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             mp.obs = qml.operation.convert_to_legacy_H(mp.obs)
 
+        if isinstance(mp.obs, qml.SparseHamiltonian) and dev.dtype == np.complex64:
+            pytest.skip(
+                reason="The conversion from qml.Hamiltonian to SparseHamiltonian is only possible with np.complex128"
+            )
+
         qs = QuantumScript(
             [
                 qml.RX(phi, 0),
@@ -641,6 +663,12 @@ def test_supports_derivatives(self, dev, config, tape, expected, batch_obs):
             qml.Z(1) + qml.X(1),
             qml.Hamiltonian([-1.0, 1.5], [qml.Z(1), qml.X(1)]),
             qml.Hermitian(qml.Hadamard.compute_matrix(), 0),
+            qml.SparseHamiltonian(
+                qml.Hamiltonian([-1.0, 1.5], [qml.Z(1), qml.X(1)]).sparse_matrix(
+                    wire_order=[0, 1, 2]
+                ),
+                wires=[0, 1, 2],
+            ),
             qml.Projector([1], 1),
         ],
     )
@@ -649,6 +677,11 @@ def test_derivatives_single_expval(
         self, theta, phi, dev, obs, execute_and_derivatives, batch_obs
     ):
         """Test that the jacobian is correct when a tape has a single expectation value"""
+        if isinstance(obs, qml.SparseHamiltonian) and dev.dtype == np.complex64:
+            pytest.skip(
+                reason="The conversion from qml.Hamiltonian to SparseHamiltonian is only possible with np.complex128"
+            )
+
         if isinstance(obs, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             obs = qml.operation.convert_to_legacy_H(obs)
 
@@ -705,6 +738,11 @@ def test_derivatives_multi_expval(
         self, theta, phi, omega, dev, obs1, obs2, execute_and_derivatives, batch_obs
     ):
         """Test that the jacobian is correct when a tape has multiple expectation values"""
+        if isinstance(obs2, qml.SparseHamiltonian) and dev.dtype == np.complex64:
+            pytest.skip(
+                reason="The conversion from qml.Hamiltonian to SparseHamiltonian is only possible with np.complex128"
+            )
+
         if isinstance(obs1, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             obs1 = qml.operation.convert_to_legacy_H(obs1)
         if isinstance(obs2, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
@@ -1074,6 +1112,11 @@ def test_vjp_multi_expval(
         self, theta, phi, omega, dev, obs1, obs2, execute_and_derivatives, batch_obs
     ):
         """Test that the VJP is correct when a tape has multiple expectation values"""
+        if isinstance(obs2, qml.SparseHamiltonian) and dev.dtype == np.complex64:
+            pytest.skip(
+                reason="The conversion from qml.Hamiltonian to SparseHamiltonian is only possible with np.complex128"
+            )
+
         if isinstance(obs1, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             obs1 = qml.operation.convert_to_legacy_H(obs1)
         if isinstance(obs2, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
diff --git a/tests/test_measurements.py b/tests/test_measurements.py
index 211a8c134b..6cb008f12f 100644
--- a/tests/test_measurements.py
+++ b/tests/test_measurements.py
@@ -151,8 +151,8 @@ def circuit():
             _ = circuit()
 
     @pytest.mark.skipif(
-        device_name in ("lightning.gpu", "lightning.tensor"),
-        reason="lightning.gpu/lightning.tensor does not support out of order prob.",
+        device_name in ("lightning.tensor",),
+        reason="lightning.tensor does not support out of order prob.",
     )
     @pytest.mark.parametrize(
         "cases",
diff --git a/tests/test_native_mcm.py b/tests/test_native_mcm.py
index 07281fb48a..050e1d27c6 100644
--- a/tests/test_native_mcm.py
+++ b/tests/test_native_mcm.py
@@ -21,7 +21,7 @@
 from conftest import LightningDevice, device_name, validate_measurements
 from flaky import flaky
 
-if device_name not in ("lightning.qubit", "lightning.kokkos"):
+if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"):
     pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True)
 
 if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access
@@ -89,7 +89,7 @@ def func(x, y):
             match=f"not accepted with finite shots on lightning.qubit",
         ):
             func(*params)
-    if device_name == "lightning.kokkos":
+    if device_name in ("lightning.kokkos", "lightning.gpu"):
         with pytest.raises(
             qml.DeviceError,
             match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on "
diff --git a/tests/test_var.py b/tests/test_var.py
index 4b4e8561fa..7bdcec2c20 100644
--- a/tests/test_var.py
+++ b/tests/test_var.py
@@ -24,7 +24,6 @@
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
-
 np.random.seed(42)