From c6f3f3f75aeedf6ce9c43dac50a0b1f08c8a6282 Mon Sep 17 00:00:00 2001
From: Shuli Shu <31480676+multiphaseCFD@users.noreply.github.com>
Date: Mon, 7 Oct 2024 20:03:08 -0400
Subject: [PATCH] Add native `setStateVector` support to `lightning.gpu` (#930)

### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [x] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`.

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**
 [SC-74668]
`setStateVector` with `state` and `wires` arguments is now supported natively
in the C++ layer of `lightning.gpu`.

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
---
 .github/CHANGELOG.md                          |   3 +
 pennylane_lightning/core/_version.py          |   2 +-
 .../lightning_gpu/StateVectorCudaMPI.hpp      | 156 ++++++++++++------
 .../lightning_gpu/StateVectorCudaManaged.hpp  | 101 ++++++++----
 .../lightning_gpu/bindings/LGPUBindings.hpp   |  25 +--
 .../bindings/LGPUBindingsMPI.hpp              |  25 +--
 .../Test_StateVectorCudaManaged_NonParam.cpp  |  63 +------
 .../mpi/Test_StateVectorCudaMPI_NonParam.cpp  |  21 +--
 .../lightning_gpu/_state_vector.py            |  29 +---
 .../lightning_gpu/lightning_gpu.py            |  11 +-
 10 files changed, 222 insertions(+), 214 deletions(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 6ac7b1fb2..27ecd62a7 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -43,6 +43,9 @@
 
 ### Improvements
 
+* Add `setStateVector(state, wires)` support to the `lightning.gpu` C++ layer.
+  [(#930)](https://github.com/PennyLaneAI/pennylane-lightning/pull/930)
+
 * Add zero-state initialization to both `StateVectorCudaManaged` and `StateVectorCudaMPI` constructors to remove the `reset_state` in the python layer ctor and refactor `setBasisState(state, wires)` in the C++ layer.
   [(#933)](https://github.com/PennyLaneAI/pennylane-lightning/pull/933)
   
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 4f9f65022..c6e0b5050 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev39"
+__version__ = "0.39.0-dev40"
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 577c510f9..964c5e69c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -289,61 +289,48 @@ class StateVectorCudaMPI final
         BaseType::getDataBuffer().zeroInit();
         setBasisState_(value, index, use_async);
     }
+
     /**
-     * @brief Set values for a batch of elements of the state-vector. This
-     * method is implemented by the customized CUDA kernel defined in the
-     * DataBuffer class.
+     * @brief Set values for a batch of elements of the state-vector.
      *
-     * @param num_indices Number of elements to be passed to the state vector.
-     * @param values Pointer to values to be set for the target elements.
-     * @param indices Pointer to indices of the target elements.
-     * @param async Use an asynchronous memory copy.
+     * @param state_ptr Pointer to initial state data.
+     * @param num_states Length of initial state data.
+     * @param wires Wires the state applies to, each below the qubit count.
+     * @param use_async Use an asynchronous memory copy. Default is false.
      */
-    template <class index_type, std::size_t thread_per_block = 256>
-    void setStateVector(const index_type num_indices,
-                        const std::complex<Precision> *values,
-                        const index_type *indices, const bool async = false) {
-        BaseType::getDataBuffer().zeroInit();
-
-        std::vector<index_type> indices_local;
-        std::vector<std::complex<Precision>> values_local;
-
-        for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
-             i++) {
-            int index = indices[i];
-            PL_ASSERT(index >= 0);
-            std::size_t rankId =
-                static_cast<std::size_t>(index) >> BaseType::getNumQubits();
-
-            if (rankId == mpi_manager_.getRank()) {
-                int local_index = static_cast<int>(
-                    compute_local_index(static_cast<std::size_t>(index),
-                                        this->getNumLocalQubits()));
-                indices_local.push_back(local_index);
-                values_local.push_back(values[i]);
+    void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
+                        const std::vector<std::size_t> &wires,
+                        bool use_async = false) {
+        PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
+                        "Inconsistent state and wires dimensions.");
+
+        const auto num_qubits = this->getTotalNumQubits();
+
+        PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
+                                     [&num_qubits](const auto i) {
+                                         return i >= num_qubits;
+                                     }) == wires.end(),
+                        "Invalid wire index.");
+
+        using index_type =
+            typename std::conditional<std::is_same<PrecisionT, float>::value,
+                                      int32_t, int64_t>::type;
+
+        // Calculate the indices of the state-vector to be set.
+        // TODO: Could move to GPU/MPI calculation if the state size is large.
+        std::vector<index_type> indices(num_states);
+        const std::size_t num_wires = wires.size();
+        constexpr std::size_t one{1U};
+        for (std::size_t i = 0; i < num_states; i++) {
+            std::size_t index{0U};
+            for (std::size_t j = 0; j < num_wires; j++) {
+                const std::size_t bit = (i & (one << j)) >> j;
+                index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
             }
+            indices[i] = static_cast<index_type>(index);
         }
-
-        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-
-        index_type num_elements = indices_local.size();
-
-        DataBuffer<index_type, int> d_indices{
-            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
-
-        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
-                                        device_id, stream_id, true};
-
-        d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
-                                    async);
-        d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
-                                   async);
-
-        setStateVector_CUDA(BaseType::getData(), num_elements,
-                            d_values.getData(), d_indices.getData(),
-                            thread_per_block, stream_id);
-        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        setStateVector_<index_type>(num_states, state_ptr, indices.data(),
+                                    use_async);
         mpi_manager_.Barrier();
     }
 
@@ -1548,6 +1535,62 @@ class StateVectorCudaMPI final
         return t_indices;
     }
 
+    /**
+     * @brief Zero the state-vector data buffer and set a batch of its elements.
+     * Only entries whose index maps to this MPI rank are copied to the device
+     * and written through `setStateVector_CUDA`.
+     *
+     * @param num_indices Number of elements to be passed to the state vector.
+     * @param values Host pointer to values to be set for the target elements.
+     * @param indices Host pointer to indices of the target elements.
+     * @param async Use an asynchronous host-to-device memory copy.
+     */
+    template <class index_type, std::size_t thread_per_block = 256>
+    void setStateVector_(const index_type num_indices,
+                         const std::complex<Precision> *values,
+                         const index_type *indices, const bool async = false) {
+        BaseType::getDataBuffer().zeroInit();
+
+        std::vector<index_type> indices_local;
+        std::vector<std::complex<Precision>> values_local;
+
+        for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
+             i++) {
+            int index = indices[i];
+            PL_ASSERT(index >= 0);
+            std::size_t rankId =
+                static_cast<std::size_t>(index) >> BaseType::getNumQubits();
+
+            if (rankId == mpi_manager_.getRank()) {
+                int local_index = static_cast<int>(
+                    compute_local_index(static_cast<std::size_t>(index),
+                                        this->getNumLocalQubits()));
+                indices_local.push_back(local_index);
+                values_local.push_back(values[i]);
+            }
+        }
+
+        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+
+        index_type num_elements = indices_local.size();
+
+        DataBuffer<index_type, int> d_indices{
+            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
+
+        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
+                                        device_id, stream_id, true};
+
+        d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
+                                    async);
+        d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
+                                   async);
+
+        setStateVector_CUDA(BaseType::getData(), num_elements,
+                            d_values.getData(), d_indices.getData(),
+                            thread_per_block, stream_id);
+    }
+
     /**
      * @brief Set value for a single element of the state-vector on device. This
      * method is implemented by cudaMemcpy.
@@ -1637,8 +1680,8 @@ class StateVectorCudaMPI final
     }
 
     /**
-     * @brief Apply parametric Pauli gates to local statevector using custateVec
-     * calls.
+     * @brief Apply parametric Pauli gates to local statevector using
+     * custateVec calls.
      *
      * @param pauli_words List of Pauli words representing operation.
      * @param ctrls Control wires
@@ -1708,7 +1751,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
@@ -1868,7 +1912,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
@@ -2009,7 +2054,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index a4e40f1d2..9e68592df 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -215,36 +215,46 @@ class StateVectorCudaManaged
     }
 
     /**
-     * @brief Set values for a batch of elements of the state-vector. This
-     * method is implemented by the customized CUDA kernel defined in the
-     * DataBuffer class.
+     * @brief Set values for a batch of elements of the state-vector.
      *
-     * @param num_indices Number of elements to be passed to the state vector.
-     * @param values Pointer to values to be set for the target elements.
-     * @param indices Pointer to indices of the target elements.
-     * @param async Use an asynchronous memory copy.
+     * @param state_ptr Pointer to the initial state data.
+     * @param num_states Length of the initial state data.
+     * @param wires Wires the state applies to, each below the qubit count.
+     * @param use_async Use an asynchronous memory copy. Default is false.
      */
-    template <class index_type, std::size_t thread_per_block = 256>
-    void setStateVector(const index_type num_indices,
-                        const std::complex<Precision> *values,
-                        const index_type *indices, const bool async = false) {
-        BaseType::getDataBuffer().zeroInit();
-
-        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-
-        index_type num_elements = num_indices;
-        DataBuffer<index_type, int> d_indices{
-            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
-        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
-                                        device_id, stream_id, true};
+    void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
+                        const std::vector<std::size_t> &wires,
+                        bool use_async = false) {
+        PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
+                        "Inconsistent state and wires dimensions.");
 
-        d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
-        d_values.CopyHostDataToGpu(values, d_values.getLength(), async);
+        const auto num_qubits = BaseType::getNumQubits();
 
-        setStateVector_CUDA(BaseType::getData(), num_elements,
-                            d_values.getData(), d_indices.getData(),
-                            thread_per_block, stream_id);
+        PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
+                                     [&num_qubits](const auto i) {
+                                         return i >= num_qubits;
+                                     }) == wires.end(),
+                        "Invalid wire index.");
+
+        using index_type =
+            typename std::conditional<std::is_same<PrecisionT, float>::value,
+                                      int32_t, int64_t>::type;
+
+        // Calculate the indices of the state-vector to be set.
+        // TODO: Could move to GPU calculation if the state size is large.
+        std::vector<index_type> indices(num_states);
+        const std::size_t num_wires = wires.size();
+        constexpr std::size_t one{1U};
+        for (std::size_t i = 0; i < num_states; i++) {
+            std::size_t index{0U};
+            for (std::size_t j = 0; j < num_wires; j++) {
+                const std::size_t bit = (i & (one << j)) >> j;
+                index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
+            }
+            indices[i] = static_cast<index_type>(index);
+        }
+        setStateVector_<index_type>(num_states, state_ptr, indices.data(),
+                                    use_async);
     }
 
     /**
@@ -1346,9 +1356,8 @@ class StateVectorCudaManaged
         return t_indices;
     }
 
-    /**
-     * @brief Set value for a single element of the state-vector on device. This
-     * method is implemented by cudaMemcpy.
+    /** @brief Set value for a single element of the state-vector on device.
+     * This method is implemented by cudaMemcpy.
      *
      * @param value Value to be set for the target element.
      * @param index Index of the target element.
@@ -1362,6 +1371,40 @@ class StateVectorCudaManaged
                            stream_id);
     }
 
+    /**
+     * @brief Zero the state-vector data buffer and set a batch of its elements.
+     * The values and indices are copied to the device and written through
+     * `setStateVector_CUDA`, followed by a device synchronization.
+     *
+     * @param num_indices Number of elements to be passed to the state vector.
+     * @param values Host pointer to values to be set for the target elements.
+     * @param indices Host pointer to indices of the target elements.
+     * @param async Use an asynchronous host-to-device memory copy.
+     */
+    template <class index_type, std::size_t thread_per_block = 256>
+    void setStateVector_(const index_type num_indices,
+                         const std::complex<Precision> *values,
+                         const index_type *indices, const bool async = false) {
+        BaseType::getDataBuffer().zeroInit();
+
+        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+
+        index_type num_elements = num_indices;
+        DataBuffer<index_type, int> d_indices{
+            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
+        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
+                                        device_id, stream_id, true};
+
+        d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
+        d_values.CopyHostDataToGpu(values, d_values.getLength(), async);
+
+        setStateVector_CUDA(BaseType::getData(), num_elements,
+                            d_values.getData(), d_indices.getData(),
+                            thread_per_block, stream_id);
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+    }
+
     /**
      * @brief Apply parametric Pauli gates using custateVec calls.
      *
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index b2d03eba5..c361bd6ed 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -63,10 +63,6 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
     using ParamT = PrecisionT;        // Parameter's data precision
     using np_arr_c = py::array_t<std::complex<ParamT>,
                                  py::array::c_style | py::array::forcecast>;
-    using np_arr_sparse_ind = typename std::conditional<
-        std::is_same<ParamT, float>::value,
-        py::array_t<int32_t, py::array::c_style | py::array::forcecast>,
-        py::array_t<int64_t, py::array::c_style | py::array::forcecast>>::type;
 
     registerGatesForStateVector<StateVectorT>(pyclass);
 
@@ -91,20 +87,15 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
             "Set the state vector to a basis state on GPU.")
         .def(
             "setStateVector",
-            [](StateVectorT &sv, const np_arr_sparse_ind &indices,
-               const np_arr_c &state, const bool use_async) {
-                using index_type = typename std::conditional<
-                    std::is_same<ParamT, float>::value, int32_t, int64_t>::type;
-
-                sv.template setStateVector<index_type>(
-                    static_cast<index_type>(indices.request().size),
-                    static_cast<std::complex<PrecisionT> *>(
-                        state.request().ptr),
-                    static_cast<index_type *>(indices.request().ptr),
-                    use_async);
+            [](StateVectorT &sv, const np_arr_c &state,
+               const std::vector<std::size_t> &wires, const bool async) {
+                const auto state_buffer = state.request();
+                const auto state_ptr =
+                    static_cast<const std::complex<ParamT> *>(state_buffer.ptr);
+                sv.setStateVector(state_ptr, state_buffer.size, wires, async);
             },
-            "Set State Vector on GPU with values and their corresponding "
-            "indices for the state vector on device")
+            "Set State Vector on GPU with values for the state vector and "
+            "wires on the host memory.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index f6a933aca..2d3313f69 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -63,10 +63,6 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
     using ParamT = PrecisionT;        // Parameter's data precision
     using np_arr_c = py::array_t<std::complex<ParamT>,
                                  py::array::c_style | py::array::forcecast>;
-    using np_arr_sparse_ind = typename std::conditional<
-        std::is_same<ParamT, float>::value,
-        py::array_t<int32_t, py::array::c_style | py::array::forcecast>,
-        py::array_t<int64_t, py::array::c_style | py::array::forcecast>>::type;
 
     registerGatesForStateVector<StateVectorT>(pyclass);
 
@@ -95,20 +91,15 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             "Set the state vector to a basis state on GPU.")
         .def(
             "setStateVector",
-            [](StateVectorT &sv, const np_arr_sparse_ind &indices,
-               const np_arr_c &state, const bool use_async) {
-                using index_type = typename std::conditional<
-                    std::is_same<ParamT, float>::value, int32_t, int64_t>::type;
-
-                sv.template setStateVector<index_type>(
-                    static_cast<index_type>(indices.request().size),
-                    static_cast<std::complex<PrecisionT> *>(
-                        state.request().ptr),
-                    static_cast<index_type *>(indices.request().ptr),
-                    use_async);
+            [](StateVectorT &sv, const np_arr_c &state,
+               const std::vector<std::size_t> &wires, const bool async) {
+                const auto state_buffer = state.request();
+                const auto state_ptr =
+                    static_cast<const std::complex<ParamT> *>(state_buffer.ptr);
+                sv.setStateVector(state_ptr, state_buffer.size, wires, async);
             },
-            "Set State Vector on GPU with values and their corresponding "
-            "indices for the state vector on device")
+            "Set State Vector on GPU with values for the state vector and "
+            "wires on the host memory.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
index dc0c1a712..af864d8b0 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
@@ -1069,68 +1069,15 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVector",
         }
 
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.CopyHostDataToGpu(init_state.data(), init_state.size());
-
-        using index_type =
-            typename std::conditional<std::is_same<PrecisionT, float>::value,
-                                      int32_t, int64_t>::type;
-        // The setStates will shuffle the state vector values on the device with
-        // the following indices and values setting on host. For example, the
-        // values[i] is used to set the indices[i] th element of state vector on
-        // the device. For example, values[2] (init_state[5]) will be copied to
-        // indices[2]th or (4th) element of the state vector.
-        std::vector<index_type> indices = {0, 2, 4, 6, 1, 3, 5, 7};
-
-        std::vector<std::complex<PrecisionT>> values = {
-            init_state[1], init_state[3], init_state[5], init_state[7],
-            init_state[0], init_state[2], init_state[4], init_state[6]};
-
-        sv.template setStateVector<index_type>(values.size(), values.data(),
-                                               indices.data(), false);
 
-        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
-    }
-}
-// LCOV_EXCL_START
-TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVectorwith_thread_setting",
-                   "[StateVectorCudaManaged_Nonparam]", float, double) {
-    using PrecisionT = TestType;
-    const std::size_t num_qubits = 3;
-    std::mt19937 re{1337};
-
-    SECTION("SetStates with a non-default GPU thread setting") {
-        auto init_state =
-            createRandomStateVectorData<PrecisionT>(re, num_qubits);
-        auto expected_state = init_state;
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
 
-        for (std::size_t i = 0; i < Pennylane::Util::exp2(num_qubits - 1);
-             i++) {
-            std::swap(expected_state[i * 2], expected_state[i * 2 + 1]);
-        }
-
-        StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.CopyHostDataToGpu(init_state.data(), init_state.size());
-
-        using index_type =
-            typename std::conditional<std::is_same<PrecisionT, float>::value,
-                                      int32_t, int64_t>::type;
-
-        std::vector<index_type> indices = {0, 2, 4, 6, 1, 3, 5, 7};
-
-        std::vector<std::complex<PrecisionT>> values = {
-            init_state[1], init_state[3], init_state[5], init_state[7],
-            init_state[0], init_state[2], init_state[4], init_state[6]};
-
-        // default setting of the number of threads in a block is 256.
-        const std::size_t threads_per_block = 1024;
-
-        sv.template setStateVector<index_type, threads_per_block>(
-            values.size(), values.data(), indices.data(), false);
-
-        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{0, 1, 2});
+        CHECK(init_state == Pennylane::Util::approx(sv.getDataVector()));
     }
 }
-// LCOV_EXCL_STOP
 
 TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetIthStates",
                    "[StateVectorCudaManaged_Nonparam]", float, double) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
index b9ed7fcbe..968badd4d 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
@@ -15,6 +15,7 @@
 #include <complex>
 #include <iostream>
 #include <limits>
+#include <numeric>
 #include <type_traits>
 #include <utility>
 #include <vector>
@@ -158,21 +159,17 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::SetStateVector",
             "the host") {
         StateVectorCudaMPI<PrecisionT> sv(mpi_manager, dt_local, mpi_buffersize,
                                           nGlobalIndexBits, nLocalIndexBits);
-        // The setStates will shuffle the state vector values on the device with
-        // the following indices and values setting on host. For example, the
-        // values[i] is used to set the indices[i] th element of state vector on
-        // the device. For example, values[2] (init_state[5]) will be copied to
-        // indices[2]th or (4th) element of the state vector.
 
-        sv.template setStateVector<index_type>(
-            init_state.size(), init_state.data(), indices.data(), false);
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+        std::vector<std::size_t> wires(num_qubits);
+        std::iota(wires.begin(), wires.end(), 0);
+        sv.setStateVector(values.data(), values.size(), wires);
 
-        mpi_manager.Barrier();
-        sv.CopyGpuDataToHost(local_state.data(),
-                             static_cast<std::size_t>(subSvLength));
-        mpi_manager.Barrier();
+        auto expected_local_state_vector = mpi_manager.scatter<cp_t>(values, 0);
 
-        CHECK(expected_local_state == Pennylane::Util::approx(local_state));
+        CHECK(expected_local_state_vector ==
+              Pennylane::Util::approx(sv.getDataVector()));
     }
 }
 
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index faca301e8..a00044356 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -31,7 +31,6 @@
 except ImportError as ex:
     warn(str(ex), UserWarning)
 
-from itertools import product
 from typing import Union
 
 import numpy as np
@@ -69,7 +68,7 @@ class LightningGPUStateVector(LightningBaseStateVector):
         device_name(string): state vector device name. Options: ["lightning.gpu"]
         mpi_handler(MPIHandler): MPI handler for PennyLane Lightning GPU device.
             Provides functionality to distribute the state-vector to multiple devices.
-        sync (bool): is host-device data copy synchronized or not.
+        use_async (bool): whether host-device data copies are asynchronous.
     """
 
     def __init__(
@@ -77,7 +76,7 @@ def __init__(
         num_wires: int,
         dtype: Union[np.complex128, np.complex64] = np.complex128,
         mpi_handler: MPIHandler = None,
-        sync: bool = True,
+        use_async: bool = False,
     ):
 
         super().__init__(num_wires, dtype)
@@ -92,7 +91,7 @@ def __init__(
         self._num_local_wires = mpi_handler.num_local_wires
 
         self._mpi_handler = mpi_handler
-        self._sync = sync
+        self._use_async = use_async
 
         # Initialize the state vector
         if self._mpi_handler.use_mpi:  # using MPI
@@ -120,7 +119,7 @@ def _state_dtype(self):
         # without MPI
         return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64
 
-    def syncD2H(self, state_vector, use_async=False):
+    def syncD2H(self, state_vector, use_async: bool = False):
         """Copy the state vector data on device to a state vector on the host provided by the user.
         Args:
             state_vector(array[complex]): the state vector array on host.
@@ -155,7 +154,7 @@ def state(self):
         self.syncD2H(state)
         return state
 
-    def syncH2D(self, state_vector, use_async=False):
+    def syncH2D(self, state_vector, use_async: bool = False):
         """Copy the state vector data on host provided by the user to the state vector on the device
         Args:
             state_vector(array[complex]): the state vector array on host.
@@ -189,7 +188,7 @@ def _asarray(arr, dtype=None):
 
         return arr
 
-    def _apply_state_vector(self, state, device_wires, use_async=False):
+    def _apply_state_vector(self, state, device_wires, use_async: bool = False):
         """Initialize the state vector on GPU with a specified state on host.
         Note that any use of this method will introduce host-overheads.
         Args:
@@ -224,20 +223,8 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
             self.syncH2D(np.reshape(local_state, output_shape))
             return
 
-        # generate basis states on subset of qubits via the cartesian product
-        basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
-
-        # get basis states to alter on full set of qubits
-        unravelled_indices = np.zeros((2 ** len(device_wires), self.num_wires), dtype=int)
-        unravelled_indices[:, device_wires] = basis_states
-
-        # get indices for which the state is changed to input state vector elements
-        ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires)
-
-        # set the state vector on GPU with the unravelled_indices and their corresponding values
-        self._qubit_state.setStateVector(
-            ravelled_indices, state, use_async
-        )  # this operation on device
+        # set the state vector on GPU from the provided state and its corresponding wires
+        self._qubit_state.setStateVector(state, list(device_wires), use_async)
 
     def _apply_lightning_controlled(self, operation):
         """Apply an arbitrary controlled operation to the state tensor.
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 84d7dd31e..2b295c499 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -296,7 +296,7 @@ class LightningGPU(LightningBase):
             is built with MPI. Default is False.
         mpi (bool): declare if the device will use the MPI support.
         mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
-        sync (bool): is host-device data copy synchronized or not.
+        use_async (bool): whether the host-device data copy is asynchronous.
     """
 
     # General device options
@@ -326,7 +326,7 @@ def __init__(  # pylint: disable=too-many-arguments
         # GPU and MPI arguments
         mpi: bool = False,
         mpi_buf_size: int = 0,
-        sync: bool = False,
+        use_async: bool = False,
     ):
         if not self._CPP_BINARY_AVAILABLE:
             raise ImportError(
@@ -349,13 +349,16 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # GPU specific options
         self._dp = DevPool()
-        self._sync = sync
+        self._use_async = use_async
 
         # Creating the state vector
         self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype)
 
         self._statevector = self.LightningStateVector(
-            num_wires=len(self.wires), dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
+            num_wires=len(self.wires),
+            dtype=c_dtype,
+            mpi_handler=self._mpi_handler,
+            use_async=self._use_async,
         )
 
     @property