From 02eb2904438d09c988a270381d2a6315b113442b Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 17 Oct 2024 18:29:55 +0000
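Subject: [PATCH] Apply (controlled) GlobalPhase via cuBLAS and custateVec

Replace the hand-written global-phase CUDA kernels in lightning_gpu with a
scaleC_CUDA call for GlobalPhase, and route controlled GlobalPhase through
custatevecApplyPauliRotation with explicit control values. Expose the
controlled overload of applyOperation to Python and update the tests.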
---
 .../lightning_gpu/StateVectorCudaMPI.hpp      |   9 +
 .../lightning_gpu/StateVectorCudaManaged.hpp  | 184 ++++++++----------
 .../lightning_gpu/bindings/LGPUBindings.hpp   |  11 ++
 .../Test_StateVectorCudaManaged_Param.cpp     |   9 +-
 .../mpi/Test_StateVectorCudaMPI_Param.cpp     |  13 ++
 .../src/simulators/lightning_gpu/initSV.cu    | 132 +------------
 .../tests/Test_StateVectorCudaManaged.cpp     |   9 +-
 .../lightning_gpu/_state_vector.py            |  10 +-
 8 files changed, 134 insertions(+), 243 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 964c5e69c..f9c0f0116 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -386,6 +386,15 @@ class StateVectorCudaMPI final
 
         if (opName == "Identity") {
             return;
+        } else if (opName == "GlobalPhase") {
+            PrecisionT param = adjoint ? -params[0] : params[0];
+            CFP_t scale_factor{std::cos(param), -std::sin(param)};
+            scaleC_CUDA<CFP_t, CFP_t, int>(
+                scale_factor, BaseType::getDataBuffer().getData(),
+                BaseType::getDataBuffer().getLength(),
+                BaseType::getDataBuffer().getDevTag().getDeviceID(),
+                BaseType::getDataBuffer().getDevTag().getStreamID(),
+                getCublasCaller());
         } else if (native_gates_.find(opName) != native_gates_.end()) {
             applyParametricPauliGate({opName}, ctrls, tgts, params.front(),
                                      adjoint);
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index f5aeb4abb..2e31bbe2f 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -67,25 +67,6 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
                                const std::size_t index, bool async,
                                cudaStream_t stream_id);
 
-extern void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
-                                        cuComplex phase,
-                                        std::size_t thread_per_block,
-                                        cudaStream_t stream_id);
-extern void globalPhaseStateVector_CUDA(cuDoubleComplex *sv, std::size_t num_sv,
-                                        cuDoubleComplex phase,
-                                        std::size_t thread_per_block,
-                                        cudaStream_t stream_id);
-
-extern void cGlobalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
-                                         bool adjoint, cuComplex *phase,
-                                         std::size_t thread_per_block,
-                                         cudaStream_t stream_id);
-extern void cGlobalPhaseStateVector_CUDA(cuDoubleComplex *sv,
-                                         std::size_t num_sv, bool adjoint,
-                                         cuDoubleComplex *phase,
-                                         std::size_t thread_per_block,
-                                         cudaStream_t stream_id);
-
 /**
  * @brief Managed memory CUDA state-vector class using custateVec backed
  * gate-calls.
@@ -257,44 +238,6 @@ class StateVectorCudaManaged
                                     use_async);
     }
 
-    /**
-     * @brief Multiplies the state-vector by a global phase.
-     *
-     * @param adjoint Indicates whether to use adjoint of gate.
-     * @param param Complex phase generator.
-     */
-    template <std::size_t thread_per_block = 256>
-    void globalPhaseStateVector(const bool adjoint, const Precision param) {
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-        std::complex<Precision> phase =
-            std::exp(std::complex<Precision>{0, (adjoint) ? param : -param});
-        auto cuPhase = complexToCu(phase);
-        globalPhaseStateVector_CUDA(BaseType::getData(), BaseType::getLength(),
-                                    cuPhase, thread_per_block, stream_id);
-    }
-
-    /**
-     * @brief Multiplies the state-vector by a controlled global phase.
-     *
-     * @param phase Controlled complex phase vector.
-     */
-    template <std::size_t thread_per_block = 256>
-    void cGlobalPhaseStateVector(const bool adjoint,
-                                 const std::vector<CFP_t> &phase,
-                                 const bool async = false) {
-        PL_ABORT_IF_NOT(BaseType::getLength() == phase.size(),
-                        "The state-vector data must have the same size as the "
-                        "controlled-phase data.")
-        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-        DataBuffer<CFP_t, int> d_phase{phase.size(), device_id, stream_id,
-                                       true};
-        d_phase.CopyHostDataToGpu(phase.data(), d_phase.getLength(), async);
-        cGlobalPhaseStateVector_CUDA(BaseType::getData(), BaseType::getLength(),
-                                     adjoint, d_phase.getData(),
-                                     thread_per_block, stream_id);
-    }
-
     /**
      * @brief Apply a single gate to the state-vector. Offloads to custatevec
      * specific API calls if available. If unable, attempts to use prior cached
@@ -347,13 +290,18 @@ class StateVectorCudaManaged
                                             wires.end()};
         if (opName == "Identity") {
             return;
-        } else if (opName == "C(GlobalPhase)") {
-            cGlobalPhaseStateVector(adjoint, gate_matrix);
         } else if (opName == "GlobalPhase") {
-            globalPhaseStateVector(adjoint, params[0]);
+            PrecisionT param = adjoint ? -params[0] : params[0];
+            CFP_t scale_factor{std::cos(param), -std::sin(param)};
+            scaleC_CUDA<CFP_t, CFP_t, int>(
+                scale_factor, BaseType::getDataBuffer().getData(),
+                BaseType::getDataBuffer().getLength(),
+                BaseType::getDataBuffer().getDevTag().getDeviceID(),
+                BaseType::getDataBuffer().getDevTag().getStreamID(),
+                getCublasCaller());
         } else if (native_gates_.find(opName) != native_gates_.end()) {
-            applyParametricPauliGate({opName}, ctrls, tgts, params.front(),
-                                     adjoint);
+            applyParametricPauliGate_({opName}, ctrls, tgts, params.front(),
+                                      adjoint);
         } else if (opName == "Rot" || opName == "CRot") {
             if (adjoint) {
                 auto rot_matrix =
@@ -407,25 +355,44 @@ class StateVectorCudaManaged
      * @param opName Name of gate to apply.
      * @param controlled_wires Control wires.
      * @param controlled_values Control values (false or true).
-     * @param wires Wires to apply gate to.
-     * @param inverse Indicates whether to use adjoint of gate.
+     * @param tgt_wires Wires to apply gate to.
+     * @param adjoint Indicates whether to use adjoint of gate.
      * @param params Optional parameter list for parametric gates.
      * @param params Optional std gate matrix if opName doesn't exist.
      */
-    template <template <typename...> class complex_t>
-    void
-    applyOperation(const std::string &opName,
-                   const std::vector<std::size_t> &controlled_wires,
-                   const std::vector<bool> &controlled_values,
-                   const std::vector<std::size_t> &wires, bool inverse = false,
-                   const std::vector<Precision> &params = {0.0},
-                   const std::vector<complex_t<Precision>> &gate_matrix = {}) {
-        PL_ABORT_IF_NOT(controlled_wires.empty(),
-                        "Controlled kernels not implemented.");
-        PL_ABORT_IF_NOT(controlled_wires.size() == controlled_values.size(),
-                        "`controlled_wires` must have the same size as "
-                        "`controlled_values`.");
-        applyOperation(opName, wires, inverse, params, gate_matrix);
+    void applyOperation(
+        const std::string &opName, const std::vector<std::size_t> &ctrl_wires,
+        const std::vector<bool> &ctrl_values,
+        const std::vector<std::size_t> &tgt_wires, bool adjoint = false,
+        const std::vector<Precision> &params = {0.0},
+        [[maybe_unused]] const std::vector<ComplexT> &gate_matrix = {}) {
+        PL_ABORT_IF_NOT(opName == "GlobalPhase",
+                        "Only GlobalPhase gate is supported.");
+        PL_ABORT_IF(ctrl_wires.size() != ctrl_values.size(),
+                    "`ctrls` and `ctrls_values` must have the same size.");
+        // params[0] is read below; reject an empty list instead of reading
+        // past the end of the vector.
+        PL_ABORT_IF(params.empty(), "GlobalPhase requires one parameter.");
+
+        // Wire w is handed to the helper as index bit (num_qubits - 1 - w).
+        const auto toIndexBit = [this](std::size_t wire) {
+            return static_cast<int>(BaseType::getNumQubits() - 1 - wire);
+        };
+        std::vector<int> ctrlsInt(ctrl_wires.size());
+        std::vector<int> tgtsInt(tgt_wires.size());
+        std::transform(ctrl_wires.begin(), ctrl_wires.end(), ctrlsInt.begin(),
+                       toIndexBit);
+        std::transform(tgt_wires.begin(), tgt_wires.end(), tgtsInt.begin(),
+                       toIndexBit);
+        // bool -> int: false becomes 0 and true becomes 1.
+        const std::vector<int> ctrls_valuesInt(ctrl_values.begin(),
+                                               ctrl_values.end());
+
+        // The helper halves its angle argument, so 2 * params[0] yields a
+        // rotation by params[0] about the all-identity Pauli word.
+        const std::vector<std::string> names(tgt_wires.size(), "I");
+        applyParametricPauliGeneralGate_(names, ctrlsInt, ctrls_valuesInt,
+                                         tgtsInt, 2 * params[0], adjoint);
+    }
 
     /**
@@ -595,20 +562,20 @@ class StateVectorCudaManaged
     inline void applyRX(const std::vector<std::size_t> &wires, bool adjoint,
                         Precision param) {
         static const std::vector<std::string> name{{"RX"}};
-        applyParametricPauliGate(name, {wires.begin(), wires.end() - 1},
-                                 {wires.back()}, param, adjoint);
+        applyParametricPauliGate_(name, {wires.begin(), wires.end() - 1},
+                                  {wires.back()}, param, adjoint);
     }
     inline void applyRY(const std::vector<std::size_t> &wires, bool adjoint,
                         Precision param) {
         static const std::vector<std::string> name{{"RY"}};
-        applyParametricPauliGate(name, {wires.begin(), wires.end() - 1},
-                                 {wires.back()}, param, adjoint);
+        applyParametricPauliGate_(name, {wires.begin(), wires.end() - 1},
+                                  {wires.back()}, param, adjoint);
     }
     inline void applyRZ(const std::vector<std::size_t> &wires, bool adjoint,
                         Precision param) {
         static const std::vector<std::string> name{{"RZ"}};
-        applyParametricPauliGate(name, {wires.begin(), wires.end() - 1},
-                                 {wires.back()}, param, adjoint);
+        applyParametricPauliGate_(name, {wires.begin(), wires.end() - 1},
+                                  {wires.back()}, param, adjoint);
     }
     inline void applyRot(const std::vector<std::size_t> &wires, bool adjoint,
                          Precision param0, Precision param1, Precision param2) {
@@ -664,17 +631,17 @@ class StateVectorCudaManaged
     inline void applyIsingXX(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
         static const std::vector<std::string> names(wires.size(), {"RX"});
-        applyParametricPauliGate(names, {}, wires, param, adjoint);
+        applyParametricPauliGate_(names, {}, wires, param, adjoint);
     }
     inline void applyIsingYY(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
         static const std::vector<std::string> names(wires.size(), {"RY"});
-        applyParametricPauliGate(names, {}, wires, param, adjoint);
+        applyParametricPauliGate_(names, {}, wires, param, adjoint);
     }
     inline void applyIsingZZ(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
         static const std::vector<std::string> names(wires.size(), {"RZ"});
-        applyParametricPauliGate(names, {}, wires, param, adjoint);
+        applyParametricPauliGate_(names, {}, wires, param, adjoint);
     }
     inline void applyIsingXY(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
@@ -789,7 +756,7 @@ class StateVectorCudaManaged
     inline void applyMultiRZ(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
         const std::vector<std::string> names(wires.size(), {"RZ"});
-        applyParametricPauliGate(names, {}, wires, param, adjoint);
+        applyParametricPauliGate_(names, {}, wires, param, adjoint);
     }
 
     /* Gate generators */
@@ -1464,12 +1431,10 @@ class StateVectorCudaManaged
      * @param tgts target wires.
      * @param use_adjoint Take adjoint of operation.
      */
-    void applyParametricPauliGate(const std::vector<std::string> &pauli_words,
-                                  std::vector<std::size_t> ctrls,
-                                  std::vector<std::size_t> tgts,
-                                  Precision param, bool use_adjoint = false) {
-        int nIndexBits = BaseType::getNumQubits();
-
+    void applyParametricPauliGate_(const std::vector<std::string> &pauli_words,
+                                   std::vector<std::size_t> ctrls,
+                                   std::vector<std::size_t> tgts,
+                                   Precision param, bool use_adjoint = false) {
         std::vector<int> ctrlsInt(ctrls.size());
         std::vector<int> tgtsInt(tgts.size());
 
@@ -1483,6 +1448,29 @@ class StateVectorCudaManaged
                 return static_cast<int>(BaseType::getNumQubits() - 1 - x);
             });
 
+        const std::vector<int> ctrls_valuesInt(ctrls.size(), 1);
+
+        applyParametricPauliGeneralGate_(pauli_words, ctrlsInt, ctrls_valuesInt,
+                                         tgtsInt, param, use_adjoint);
+    }
+
+    /**
+     * @brief Apply a parametric Pauli gate using custateVec calls.
+     *
+     * @param pauli_words List of Pauli words representing operation.
+     * @param ctrlsInt Control index bits, forwarded unchanged to custateVec.
+     * @param ctrls_valuesInt Control bit values, one per entry of ctrlsInt.
+     * @param tgtsInt Target index bits, forwarded unchanged to custateVec.
+     * @param param Rotation angle; it is halved before the custateVec call.
+     * @param use_adjoint Take adjoint of operation.
+     */
+    void applyParametricPauliGeneralGate_(
+        const std::vector<std::string> &pauli_words,
+        const std::vector<int> &ctrlsInt,
+        const std::vector<int> &ctrls_valuesInt,
+        const std::vector<int> &tgtsInt, Precision param,
+        bool use_adjoint = false) {
+        int nIndexBits = BaseType::getNumQubits();
         cudaDataType_t data_type;
 
         if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
@@ -1491,14 +1479,12 @@ class StateVectorCudaManaged
         } else {
             data_type = CUDA_C_32F;
         }
-
         std::vector<custatevecPauli_t> pauli_enums;
         pauli_enums.reserve(pauli_words.size());
         for (const auto &pauli_str : pauli_words) {
             pauli_enums.push_back(native_gates_.at(pauli_str));
         }
         const auto local_angle = (use_adjoint) ? param / 2 : -param / 2;
-
         PL_CUSTATEVEC_IS_SUCCESS(custatevecApplyPauliRotation(
             /* custatevecHandle_t */ handle_.get(),
             /* void* */ BaseType::getData(),
@@ -1507,10 +1493,10 @@ class StateVectorCudaManaged
             /* double */ local_angle,
             /* const custatevecPauli_t* */ pauli_enums.data(),
             /* const int32_t* */ tgtsInt.data(),
-            /* const uint32_t */ tgts.size(),
+            /* const uint32_t */ tgtsInt.size(),
             /* const int32_t* */ ctrlsInt.data(),
-            /* const int32_t* */ nullptr,
-            /* const uint32_t */ ctrls.size()));
+            /* const int32_t* */ ctrls_valuesInt.data(),
+            /* const uint32_t */ ctrlsInt.size()));
         PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
             BaseType::getDataBuffer().getDevTag().getStreamID()));
     }
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index 145097b30..a755f61d8 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -152,6 +152,17 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
             "Initialize the statevector data to the |0...0> state")
         .def("collapse", &StateVectorT::collapse,
              "Collapse the statevector onto the 0 or 1 branch of a given wire.")
+        .def(
+            "apply",
+            [](StateVectorT &sv, const std::string &gate_name,
+               const std::vector<std::size_t> &controlled_wires,
+               const std::vector<bool> &controlled_values,
+               const std::vector<std::size_t> &wires, bool inverse,
+               const std::vector<ParamT> &params) {
+                sv.applyOperation(gate_name, controlled_wires,
+                                  controlled_values, wires, inverse, params);
+            },
+            "Apply a controlled operation by name.")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Param.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Param.cpp
index e2485910d..6a0de4efd 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Param.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Param.cpp
@@ -1564,7 +1564,6 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyControlledGlobalPhase",
     std::mt19937_64 re{1337};
     const std::size_t num_qubits = 3;
     const bool inverse = GENERATE(false, true);
-    const std::size_t index = GENERATE(0, 1, 2);
     /* The `phase` array contains the diagonal entries of the controlled-phase
        operator. It can be created in Python using the following command
 
@@ -1582,7 +1581,11 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyControlledGlobalPhase",
     auto sv_data = createRandomStateVectorData<TestType>(re, num_qubits);
     StateVectorCudaManaged<TestType> sv(
         reinterpret_cast<ComplexT *>(sv_data.data()), sv_data.size());
-    sv.applyOperation("C(GlobalPhase)", {index}, inverse, {}, phase);
+    std::vector<std::size_t> ctrls = {0, 1};
+    std::vector<bool> ctrl_vals = {0, 1};
+    std::vector<std::size_t> tgts = {2};
+    const TestType param = -M_PI_2;
+    sv.applyOperation("GlobalPhase", ctrls, ctrl_vals, tgts, inverse, {param});
     auto result_sv = sv.getDataVector();
     for (std::size_t j = 0; j < exp2(num_qubits); j++) {
         ComplexT tmp = (inverse) ? conj(phase[j]) : phase[j];
@@ -1590,4 +1593,4 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::applyControlledGlobalPhase",
         CHECK((real(result_sv[j])) == Approx(real(tmp)));
         CHECK((imag(result_sv[j])) == Approx(imag(tmp)));
     }
-}
\ No newline at end of file
+}
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_Param.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_Param.cpp
index 17cf43e84..329b9f754 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_Param.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_Param.cpp
@@ -356,6 +356,19 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::DoubleExcitationPlus",
         msb_4qbit, angle_1param);
 }
 
+TEMPLATE_TEST_CASE("StateVectorCudaMPI::GlobalPhase",
+                   "[StateVectorCudaMPI_Param]", float, double) {
+    PLGPU_MPI_TEST_GATE_OPS_PARAM(TestType, num_qubits,
+                                  applyDoubleExcitationPlus, "GlobalPhase",
+                                  lsb_4qbit, angle_1param);
+    PLGPU_MPI_TEST_GATE_OPS_PARAM(TestType, num_qubits,
+                                  applyDoubleExcitationPlus, "GlobalPhase",
+                                  mlsb_4qbit, angle_1param);
+    PLGPU_MPI_TEST_GATE_OPS_PARAM(TestType, num_qubits,
+                                  applyDoubleExcitationPlus, "GlobalPhase",
+                                  msb_4qbit, angle_1param);
+}
+
 TEMPLATE_TEST_CASE("LightningGPUMPI:applyOperation", "[LightningGPUMPI_Param]",
                    float, double) {
     using StateVectorT = StateVectorCudaMPI<TestType>;
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
index 8a62e89e8..10f56e6a8 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
@@ -106,96 +106,6 @@ void setStateVector_CUDA_call(GPUDataT *sv, index_type &num_indices,
     PL_CUDA_IS_SUCCESS(cudaGetLastError());
 }
 
-/**
- * @brief The CUDA kernel that multiplies the state vector data on GPU device
- * by a global phase.
- *
- * @param sv Complex data pointer of state vector on device.
- * @param num_sv Number of state vector elements.
- * @param phase Complex data pointer of input values (on device).
- */
-template <class GPUDataT, class index_type>
-__global__ void globalPhaseStateVectorkernel(GPUDataT *sv, index_type num_sv,
-                                             GPUDataT phase) {
-    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < num_sv) {
-        sv[i] = Cmul(sv[i], phase);
-    }
-}
-
-/**
- * @brief The CUDA kernel call wrapper.
- *
- * @param sv Complex data pointer of state vector on device.
- * @param num_sv Number of state vector elements.
- * @param phase Constant complex phase.
- * @param thread_per_block Number of threads set per block.
- * @param stream_id Stream id of CUDA calls
- */
-template <class GPUDataT, class index_type>
-void globalPhaseStateVector_CUDA_call(GPUDataT *sv, index_type num_sv,
-                                      GPUDataT phase,
-                                      std::size_t thread_per_block,
-                                      cudaStream_t stream_id) {
-    auto dv = std::div(static_cast<long>(num_sv), thread_per_block);
-    std::size_t num_blocks = dv.quot + (dv.rem == 0 ? 0 : 1);
-    const std::size_t block_per_grid = (num_blocks == 0 ? 1 : num_blocks);
-    dim3 blockSize(thread_per_block, 1, 1);
-    dim3 gridSize(block_per_grid, 1);
-
-    globalPhaseStateVectorkernel<GPUDataT, index_type>
-        <<<gridSize, blockSize, 0, stream_id>>>(sv, num_sv, phase);
-    PL_CUDA_IS_SUCCESS(cudaGetLastError());
-}
-
-/**
- * @brief The CUDA kernel that multiplies the state vector data on GPU device
- * by a controlled global phase.
- *
- * @param sv Complex data pointer of state vector on device.
- * @param num_sv Number of state vector elements.
- * @param phase Complex data pointer of controlled global phase values (on
- * device).
- */
-template <class GPUDataT, class index_type, bool adjoint = false>
-__global__ void cGlobalPhaseStateVectorkernel(GPUDataT *sv, index_type num_sv,
-                                              GPUDataT *phase) {
-    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < num_sv) {
-        if constexpr (adjoint) {
-            sv[i] = Cmul(sv[i], Conj(phase[i]));
-        } else {
-            sv[i] = Cmul(sv[i], phase[i]);
-        }
-    }
-}
-
-/**
- * @brief The CUDA kernel call wrapper.
- *
- * @param sv Complex data pointer of state vector on device.
- * @param num_sv Number of state vector elements.
- * @param phase Complex data pointer of controlled global phase values (on
- * device).
- * @param thread_per_block Number of threads set per block.
- * @param stream_id Stream id of CUDA calls
- */
-template <class GPUDataT, class index_type, bool adjoint = false>
-void cGlobalPhaseStateVector_CUDA_call(GPUDataT *sv, index_type num_sv,
-                                       GPUDataT *phase,
-                                       std::size_t thread_per_block,
-                                       cudaStream_t stream_id) {
-    auto dv = std::div(static_cast<long>(num_sv), thread_per_block);
-    std::size_t num_blocks = dv.quot + (dv.rem == 0 ? 0 : 1);
-    const std::size_t block_per_grid = (num_blocks == 0 ? 1 : num_blocks);
-    dim3 blockSize(thread_per_block, 1, 1);
-    dim3 gridSize(block_per_grid, 1);
-
-    cGlobalPhaseStateVectorkernel<GPUDataT, index_type, adjoint>
-        <<<gridSize, blockSize, 0, stream_id>>>(sv, num_sv, phase);
-    PL_CUDA_IS_SUCCESS(cudaGetLastError());
-}
-
 /**
  * @brief CUDA runtime API call wrapper.
  *
@@ -242,44 +152,4 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
                         cudaStream_t stream_id) {
     setBasisState_CUDA_call(sv, value, index, async, stream_id);
 }
-
-void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
-                                 cuComplex phase, std::size_t thread_per_block,
-                                 cudaStream_t stream_id) {
-    globalPhaseStateVector_CUDA_call(sv, num_sv, phase, thread_per_block,
-                                     stream_id);
-}
-void globalPhaseStateVector_CUDA(cuDoubleComplex *sv, std::size_t num_sv,
-                                 cuDoubleComplex phase,
-                                 std::size_t thread_per_block,
-                                 cudaStream_t stream_id) {
-    globalPhaseStateVector_CUDA_call(sv, num_sv, phase, thread_per_block,
-                                     stream_id);
-}
-
-void cGlobalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
-                                  bool adjoint, cuComplex *phase,
-                                  std::size_t thread_per_block,
-                                  cudaStream_t stream_id) {
-    if (adjoint) {
-        cGlobalPhaseStateVector_CUDA_call<cuComplex, std::size_t, true>(
-            sv, num_sv, phase, thread_per_block, stream_id);
-    } else {
-        cGlobalPhaseStateVector_CUDA_call<cuComplex, std::size_t, false>(
-            sv, num_sv, phase, thread_per_block, stream_id);
-    }
-}
-void cGlobalPhaseStateVector_CUDA(cuDoubleComplex *sv, std::size_t num_sv,
-                                  bool adjoint, cuDoubleComplex *phase,
-                                  std::size_t thread_per_block,
-                                  cudaStream_t stream_id) {
-    if (adjoint) {
-        cGlobalPhaseStateVector_CUDA_call<cuDoubleComplex, std::size_t, true>(
-            sv, num_sv, phase, thread_per_block, stream_id);
-    } else {
-        cGlobalPhaseStateVector_CUDA_call<cuDoubleComplex, std::size_t, false>(
-            sv, num_sv, phase, thread_per_block, stream_id);
-    }
-}
-
-} // namespace Pennylane::LightningGPU
\ No newline at end of file
+} // namespace Pennylane::LightningGPU
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
index 841074474..6d2f17358 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
@@ -194,14 +194,15 @@ TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaManaged::applyOperations",
                                         std::vector<std::size_t>{1}, false,
                                         {0.0}, std::vector<ComplexT>{}),
             LightningException,
-            "Controlled kernels not implemented."); // invalid controlled_wires
+            "Only GlobalPhase gate is supported."); // invalid controlled_wires
         PL_REQUIRE_THROWS_MATCHES(
-            state_vector.applyOperation("PauliX", {}, std::vector<bool>{false},
+            state_vector.applyOperation("GlobalPhase", {},
+                                        std::vector<bool>{false},
                                         std::vector<std::size_t>{1}, false,
                                         {0.0}, std::vector<ComplexT>{}),
             LightningException,
-            "`controlled_wires` must have the same size "
-            "as"); // invalid controlled_wires
+            "`ctrls` and `ctrls_values` must have the "
+            "same size."); // invalid controlled_wires
     }
 
     SECTION("Test invalid arguments with parameters") {
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 77e453778..a3787d40f 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -43,7 +43,6 @@
 from pennylane.wires import Wires
 
 # pylint: disable=ungrouped-imports
-from pennylane_lightning.core._serialize import global_phase_diagonal
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
 
 from ._measurements import LightningGPUMeasurements
@@ -241,15 +240,14 @@ def _apply_lightning_controlled(self, operation):
         """
         state = self.state_vector
 
+        basename = operation.base.name
         control_wires = list(operation.control_wires)
         control_values = operation.control_values
-        name = operation.name
+        target_wires = list(operation.target_wires)
         # Apply GlobalPhase
         inv = False
-        param = operation.parameters[0]
-        wires = self.wires.indices(operation.wires)
-        matrix = global_phase_diagonal(param, self.wires, control_wires, control_values)
-        state.apply(name, wires, inv, [[param]], matrix)
+        param = operation.parameters
+        state.apply(basename, control_wires, control_values, target_wires, inv, param)
 
     def _apply_lightning_midmeasure(
         self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str