Add CUDA-12 capability (#477)

* M pennylane_lightning/core/src/bindings/Bindings.hpp; hack `JacobianData` to work with devices. M pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp; `applyMatrix` bugfix: use intermediate hostview to copy matrix data; same bugfix for `getDataVector`. M pennylane_lightning/core/src/simulators/lightning_kokkos/algorithms/AdjointJacobianKokkos.hpp; use copy constructor. M pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp; use copy constructor. M pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.hpp; use copy constructor. M requirements-dev.txt; add clang-format-14. * Auto update version * Update changelog. * Auto update version * Auto update version * Add an argument to adjointJacobian to avoid syncing and copying state vector data in adjoint-diff. * Reformat * trigger CI * [skip ci] Update changelog. * Auto update version * Auto update version * Accel/expval (#481) * Introduce std::unordered_map<std::string, ExpValFunc> expval_funcs_. * Introduce applyExpectationValueFunctor. * Add binding to LKokkos expval(matrix, wires). Combine expval functor calls into two templated methods. Call specialized expval methods when possible. Remove obsolete 'Apply directly' tests. * Update changelog. * Add test for arbitrary expval(Hermitian). * Add getExpectationValueMultiQubitOpFunctor. * Add typename hint for macos. * Add typename macos. * Use Kokkos::ThreadVectorRange policy for innerloop in getExpectationValueMultiQubitOpFunctor. * Couple fix for HIP. * Use inner product scheme instead of getExpectationValueMultiQubitOpFunctor to compute multi-qubit expval. --------- Co-authored-by: Dev version update bot <github-actions[bot]@users.noreply.github.com> Co-authored-by: Amintor Dusko <[email protected]>
PennyLaneAI · Aug 25, 2023 · 1bd68b4 · 1bd68b4
1 parent e7922ff
commit 1bd68b4
Show file tree

Hide file tree

Showing 18 changed files with 267 additions and 406 deletions.
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* The `lightning_kokkos` backend supports Nvidia GPU execution (with Kokkos v4 and CUDA v12). 
+  [(#477)](https://github.com/PennyLaneAI/pennylane-lightning/pull/477)
+
 * Complete overhaul of repository structure to facilitate integration of multiple backends. Refactoring efforts were directed to improve development performance, code reuse and decrease overall overhead to propagate changes through backends. New C++ modular build strategy allows for faster test builds restricted to a module. Update CI/CD actions concurrency strategy. Change minimal Python version to 3.9.
   [(#472)](https://github.com/PennyLaneAI/pennylane-lightning/pull/472)
 
@@ -13,8 +16,14 @@
 
 ### Breaking changes
 
+* Modify `adjointJacobian` methods to accept a (possibly unused) `StateVectorT` reference, allowing device-backed simulators to directly access state vector data for adjoint differentiation instead of copying it back and forth into `JacobianData` (host memory).
+  [(#477)](https://github.com/PennyLaneAI/pennylane-lightning/pull/477)
+
 ### Improvements
 
+* Refactor LKokkos `Measurements` class to use (fast) specialized functors whenever possible.
+  [(#481)](https://github.com/PennyLaneAI/pennylane-lightning/pull/481)
+
 * Merge Lightning Qubit and Lightning Kokkos backends in the new repository.
   [(#472)](https://github.com/PennyLaneAI/pennylane-lightning/pull/472)
 

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.32.0-dev12"
+__version__ = "0.32.0-dev13"
diff --git a/pennylane_lightning/core/src/algorithms/AdjointJacobianBase.hpp b/pennylane_lightning/core/src/algorithms/AdjointJacobianBase.hpp
@@ -157,8 +157,9 @@ template <class StateVectorT, class Derived> class AdjointJacobianBase {
      */
     inline void adjointJacobian(std::span<PrecisionT> jac,
                                 const JacobianData<StateVectorT> &jd,
+                                const StateVectorT &ref_data = {0},
                                 bool apply_operations = false) {
-        return static_cast<Derived *>(this)->adjointJacobian(jac, jd,
+        return static_cast<Derived *>(this)->adjointJacobian(jac, jd, ref_data,
                                                              apply_operations);
     }
 

diff --git a/pennylane_lightning/core/src/algorithms/tests/Test_AdjointJacobian.cpp b/pennylane_lightning/core/src/algorithms/tests/Test_AdjointJacobian.cpp
@@ -90,7 +90,7 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape{
                 num_params, psi.getLength(), psi.getData(), {obs}, ops, tp};
             PL_REQUIRE_THROWS_MATCHES(
-                adj.adjointJacobian(std::span{jacobian}, tape, true),
+                adj.adjointJacobian(std::span{jacobian}, tape, psi, true),
                 LightningException,
                 "The size of preallocated jacobian must be same as");
         }
@@ -116,7 +116,7 @@ template <typename TypeList> void testAdjointJacobian() {
                 JacobianData<StateVectorT> tape{
                     num_params, psi.getLength(), psi.getData(), {obs}, ops, tp};
                 REQUIRE_NOTHROW(
-                    adj.adjointJacobian(std::span{jacobian}, tape, true));
+                    adj.adjointJacobian(std::span{jacobian}, tape, psi, true));
             }
         }
 
@@ -141,7 +141,7 @@ template <typename TypeList> void testAdjointJacobian() {
 
                 JacobianData<StateVectorT> tape{
                     num_params, psi.getLength(), psi.getData(), {obs}, ops, tp};
-                adj.adjointJacobian(std::span{jacobian}, tape, true);
+                adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
                 CAPTURE(jacobian);
                 CHECK(-sin(p) == Approx(jacobian[0]));
@@ -169,7 +169,7 @@ template <typename TypeList> void testAdjointJacobian() {
 
                 JacobianData<StateVectorT> tape{
                     num_params, psi.getLength(), psi.getData(), {obs}, ops, tp};
-                adj.adjointJacobian(std::span{jacobian}, tape, true);
+                adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
                 CAPTURE(jacobian);
                 CHECK(cos(p) == Approx(jacobian[0]).margin(1e-7));
@@ -199,7 +199,7 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape{num_params,    psi.getLength(),
                                             psi.getData(), {obs1, obs2},
                                             ops,           tp};
-            adj.adjointJacobian(std::span{jacobian}, tape, true);
+            adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
             CAPTURE(jacobian);
             CHECK(-sin(param[0]) == Approx(jacobian[0]).margin(1e-7));
@@ -233,7 +233,7 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape{num_params,    psi.getLength(),
                                             psi.getData(), {obs1, obs2, obs3},
                                             ops,           tp};
-            adj.adjointJacobian(std::span{jacobian}, tape, true);
+            adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
             CAPTURE(jacobian);
             CHECK(-sin(param[0]) == Approx(jacobian[0]).margin(1e-7));
@@ -271,7 +271,7 @@ template <typename TypeList> void testAdjointJacobian() {
                                             psi.getData(), {obs1, obs2, obs3},
                                             ops,           t_params};
 
-            adj.adjointJacobian(std::span{jacobian}, tape, true);
+            adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
             CAPTURE(jacobian);
             CHECK(-sin(param[0]) == Approx(jacobian[0]).margin(1e-7));
@@ -307,7 +307,7 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape{
                 num_params, psi.getLength(), psi.getData(), {obs}, ops, tp};
 
-            adj.adjointJacobian(std::span{jacobian}, tape, true);
+            adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
             CAPTURE(jacobian);
 
@@ -353,7 +353,7 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape{
                 num_params, psi.getLength(), psi.getData(), {obs}, ops, tp};
 
-            adj.adjointJacobian(std::span{jacobian}, tape, true);
+            adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
             CAPTURE(jacobian);
 
@@ -405,7 +405,7 @@ template <typename TypeList> void testAdjointJacobian() {
 
                 JacobianData<StateVectorT> tape{
                     num_params, psi.getLength(), psi.getData(), {obs}, ops, tp};
-                adj.adjointJacobian(std::span{jacobian}, tape, true);
+                adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
                 CAPTURE(theta);
                 CAPTURE(jacobian);
@@ -473,7 +473,7 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape{
                 t_params.size(), psi.getLength(), psi.getData(), {obs}, ops,
                 t_params};
-            adj.adjointJacobian(std::span{jacobian}, tape, true);
+            adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
             std::vector<PrecisionT> expected{-0.71429188, 0.04998561,
                                              -0.71904837};
@@ -510,7 +510,7 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape{
                 num_params, psi.getLength(), psi.getData(), {ham}, ops, tp};
 
-            adj.adjointJacobian(std::span{jacobian}, tape, true);
+            adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
             CAPTURE(jacobian);
             CHECK(-0.3 * sin(param[0]) == Approx(jacobian[0]).margin(1e-7));
@@ -546,7 +546,7 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape{num_params,    psi.getLength(),
                                             psi.getData(), {ham},
                                             ops,           t_params};
-            adj.adjointJacobian(std::span{jacobian}, tape, true);
+            adj.adjointJacobian(std::span{jacobian}, tape, psi, true);
 
             CAPTURE(jacobian);
             CHECK((-0.47 * sin(param[0]) == Approx(jacobian[0]).margin(1e-7)));
@@ -588,8 +588,8 @@ template <typename TypeList> void testAdjointJacobian() {
             JacobianData<StateVectorT> tape2{num_params,    psi.getLength(),
                                              psi.getData(), {obs2},
                                              ops,           t_params};
-            adj.adjointJacobian(std::span{jacobian1}, tape1, true);
-            adj.adjointJacobian(std::span{jacobian2}, tape2, true);
+            adj.adjointJacobian(std::span{jacobian1}, tape1, psi, true);
+            adj.adjointJacobian(std::span{jacobian2}, tape2, psi, true);
 
             CHECK((jacobian1 == PLApprox(jacobian2).margin(1e-7)));
         }

diff --git a/pennylane_lightning/core/src/bindings/Bindings.hpp b/pennylane_lightning/core/src/bindings/Bindings.hpp
@@ -468,9 +468,7 @@ auto registerAdjointJacobian(
                                         observables,
                                         operations,
                                         trainableParams};
-
-    adjoint_jacobian.adjointJacobian(std::span{jac}, jd);
-
+    adjoint_jacobian.adjointJacobian(std::span{jac}, jd, sv);
     return py::array_t<PrecisionT>(py::cast(jac));
 }
 

diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp
@@ -62,9 +62,9 @@ class StateVectorKokkos final
     using PrecisionT = fp_t;
     using ComplexT = Kokkos::complex<fp_t>;
     using KokkosExecSpace = Kokkos::DefaultExecutionSpace;
+    using HostExecSpace = Kokkos::DefaultHostExecutionSpace;
     using KokkosVector = Kokkos::View<ComplexT *>;
     using KokkosSizeTVector = Kokkos::View<size_t *>;
-    using KokkosRangePolicy = Kokkos::RangePolicy<KokkosExecSpace>;
     using UnmanagedComplexHostView =
         Kokkos::View<ComplexT *, Kokkos::HostSpace,
                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
@@ -81,12 +81,10 @@ class StateVectorKokkos final
         Kokkos::View<PrecisionT *, Kokkos::HostSpace,
                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
     using ScratchViewComplex =
-        Kokkos::View<ComplexT *,
-                     Kokkos::DefaultExecutionSpace::scratch_memory_space,
+        Kokkos::View<ComplexT *, KokkosExecSpace::scratch_memory_space,
                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
     using ScratchViewSizeT =
-        Kokkos::View<size_t *,
-                     Kokkos::DefaultExecutionSpace::scratch_memory_space,
+        Kokkos::View<size_t *, KokkosExecSpace::scratch_memory_space,
                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
     using TeamPolicy = Kokkos::TeamPolicy<>;
 
@@ -398,10 +396,14 @@ class StateVectorKokkos final
                             bool inverse = false) {
         PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0");
         size_t n = 1U << wires.size();
-        KokkosVector matrix_("matrix_", n * n);
-        for (size_t i = 0; i < n * n; i++) {
-            matrix_(i) = matrix[i];
-        }
+        size_t n2 = n * n;
+        KokkosVector matrix_("matrix_", n2);
+        typename KokkosVector::HostMirror matrix_h =
+            Kokkos::create_mirror_view(matrix_);
+        Kokkos::parallel_for(
+            Kokkos::RangePolicy<HostExecSpace>(0, n2),
+            KOKKOS_LAMBDA(const size_t i) { matrix_h(i) = matrix[i]; });
+        Kokkos::deep_copy(matrix_, matrix_h);
         applyMultiQubitOp(matrix_, wires, inverse);
     }
 
@@ -760,13 +762,14 @@ class StateVectorKokkos final
      * @brief Get underlying data vector
      */
     [[nodiscard]] auto getDataVector() -> std::vector<ComplexT> {
-        std::vector<ComplexT> data_(getData(), getData() + this->getLength());
+        std::vector<ComplexT> data_(this->getLength());
+        DeviceToHost(data_.data(), data_.size());
         return data_;
     }
 
     [[nodiscard]] auto getDataVector() const -> const std::vector<ComplexT> {
-        const std::vector<ComplexT> data_(getData(),
-                                          getData() + this->getLength());
+        std::vector<ComplexT> data_(this->getLength());
+        DeviceToHost(data_.data(), data_.size());
         return data_;
     }
 
@@ -782,7 +785,7 @@ class StateVectorKokkos final
      * @brief Copy data from the device space to the host space.
      *
      */
-    inline void DeviceToHost(ComplexT *sv, size_t length) {
+    inline void DeviceToHost(ComplexT *sv, size_t length) const {
         Kokkos::deep_copy(UnmanagedComplexHostView(sv, length), *data_);
     }
 

diff --git a/...ylane_lightning/core/src/simulators/lightning_kokkos/algorithms/AdjointJacobianKokkos.hpp b/...ylane_lightning/core/src/simulators/lightning_kokkos/algorithms/AdjointJacobianKokkos.hpp
@@ -83,6 +83,7 @@ class AdjointJacobian final
      */
     void adjointJacobian(std::span<PrecisionT> jac,
                          const JacobianData<StateVectorT> &jd,
+                         const StateVectorT &ref_data,
                          bool apply_operations = false) {
         const OpsData<StateVectorT> &ops = jd.getOperations();
         const std::vector<std::string> &ops_name = ops.getOpsName();
@@ -112,12 +113,8 @@ class AdjointJacobian final
         auto tp_it = tp.rbegin();
         const auto tp_rend = tp.rend();
 
-        StateVectorKokkos<PrecisionT> ref_data(jd.getPtrStateVec(),
-                                               jd.getSizeStateVec());
-
         // Create $U_{1:p}\vert \lambda \rangle$
-        StateVectorT lambda(ref_data.getNumQubits());
-        lambda.DeviceToDevice(ref_data.getView());
+        StateVectorT lambda{ref_data};
 
         // Apply given operations to statevector if requested
         if (apply_operations) {
@@ -129,7 +126,7 @@ class AdjointJacobian final
                                            StateVectorT(lambda.getNumQubits()));
         this->applyObservables(H_lambda, lambda, obs);
 
-        StateVectorT mu(lambda.getNumQubits());
+        StateVectorT mu{lambda.getNumQubits()};
 
         for (int op_idx = static_cast<int>(ops_name.size() - 1); op_idx >= 0;
              op_idx--) {

diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/bindings/LKokkosBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/bindings/LKokkosBindings.hpp
@@ -26,6 +26,7 @@
 #include "MeasurementsKokkos.hpp"
 #include "StateVectorKokkos.hpp"
 #include "TypeList.hpp"
+#include "Util.hpp" // exp2
 
 /// @cond DEV
 namespace {
@@ -34,6 +35,7 @@ using namespace Pennylane::LightningKokkos::Algorithms;
 using namespace Pennylane::LightningKokkos::Measures;
 using Kokkos::InitializationSettings;
 using Pennylane::LightningKokkos::StateVectorKokkos;
+using Pennylane::Util::exp2;
 } // namespace
 /// @endcond
 
@@ -164,6 +166,18 @@ void registerBackendSpecificMeasurements(PyClass &pyclass) {
                  const std::string &, const std::vector<size_t> &)>(
                  &Measurements<StateVectorT>::expval),
              "Expected value of an operation by name.")
+        .def(
+            "expval",
+            [](Measurements<StateVectorT> &M, const np_arr_c &matrix,
+               const std::vector<size_t> &wires) {
+                const std::size_t matrix_size = exp2(2 * wires.size());
+                auto matrix_data =
+                    static_cast<ComplexT *>(matrix.request().ptr);
+                std::vector<ComplexT> matrix_v{matrix_data,
+                                               matrix_data + matrix_size};
+                return M.expval(matrix_v, wires);
+            },
+            "Expected value of a Hermitian observable.")
         .def(
             "expval",
             [](Measurements<StateVectorT> &M, const np_arr_sparse_ind &row_map,

diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/gates/GateFunctorsNonparam.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/gates/GateFunctorsNonparam.hpp
@@ -173,7 +173,8 @@ template <class PrecisionT, bool inverse = false> struct sFunctor {
         rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
         wire_parity = fillTrailingOnes(rev_wire);
         wire_parity_inv = fillLeadingOnes(rev_wire + 1);
-        shift = (inverse) ? -Kokkos::complex(0, 1) : Kokkos::complex(0, 1);
+        shift =
+            (inverse) ? -Kokkos::complex{0.0, 1.0} : Kokkos::complex{0.0, 1.0};
     }
 
     KOKKOS_INLINE_FUNCTION