Commit 4dc2f7f

Merge branch 'master' into maa-lk-decom-fixes

maliasadi authored Jun 7, 2024
2 parents d4944fd + 1783b8b
Showing 19 changed files with 282 additions and 425 deletions.
12 changes: 12 additions & 0 deletions .github/CHANGELOG.md
@@ -1,6 +1,9 @@
# Release 0.37.0-dev

### New features since last release
* Add `inverse` support for gate operations in `lightning.tensor` in the C++ layer.
[(#753)](https://github.com/PennyLaneAI/pennylane-lightning/pull/753)
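  For context, the inverse (adjoint) of a unitary gate is its conjugate transpose; a minimal NumPy sketch of the relation this support relies on (illustrative only, not the C++ API):

  ```python
  import numpy as np

  # S gate as an example single-qubit unitary.
  S = np.array([[1, 0], [0, 1j]])

  # The inverse (adjoint) of a unitary is its conjugate transpose.
  S_inv = S.conj().T

  # A unitary times its adjoint is the identity.
  assert np.allclose(S @ S_inv, np.eye(2))
  ```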

* Add `observable` and `expval` support to `cutensornet` backed `lightning.tensor` C++ layer.
[(#728)](https://github.com/PennyLaneAI/pennylane-lightning/pull/728)
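  A hypothetical end-to-end usage once these C++ additions are surfaced in the Python layer (the device construction and gate set below are assumptions, not confirmed by this commit):

  ```python
  import pennylane as qml

  # Assumed Python-layer device; this commit only adds the C++ support.
  dev = qml.device("lightning.tensor", wires=2)

  @qml.qnode(dev)
  def circuit():
      qml.Hadamard(wires=0)
      qml.CNOT(wires=[0, 1])
      qml.adjoint(qml.S(wires=1))       # inverse gate application
      return qml.expval(qml.PauliZ(0))  # expval of an observable
  ```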

@@ -29,6 +32,12 @@

### Improvements

* Refactor C++ library names for `lightning.tensor`.
[(#755)](https://github.com/PennyLaneAI/pennylane-lightning/pull/755)

* Set `state_tensor` as `const` for the `MeasurementTNCuda` class.
[(#753)](https://github.com/PennyLaneAI/pennylane-lightning/pull/753)

* Update Kokkos version and support to 4.3.01.
[(#725)](https://github.com/PennyLaneAI/pennylane-lightning/pull/725)

@@ -67,6 +76,9 @@

### Bug fixes

* `lightning.qubit` and `lightning.kokkos` use `qml.ops.Conditional.base` instead of `qml.ops.Conditional.then_op`.
[(#752)](https://github.com/PennyLaneAI/pennylane-lightning/pull/752)
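  The `Conditional` wrapper arises from mid-circuit measurements; a minimal sketch of the affected pattern (the circuit itself is illustrative):

  ```python
  import pennylane as qml

  dev = qml.device("lightning.qubit", wires=2)

  @qml.qnode(dev)
  def circuit():
      qml.Hadamard(wires=0)
      m = qml.measure(0)  # mid-circuit measurement
      # qml.cond wraps PauliX in a qml.ops.Conditional; per this fix, the
      # devices now retrieve the wrapped operation via Conditional.base.
      qml.cond(m, qml.PauliX)(wires=1)
      return qml.expval(qml.PauliZ(1))
  ```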

* Fix AVX streaming operation support with newer GCC.
[(#729)](https://github.com/PennyLaneAI/pennylane-lightning/pull/729)

1 change: 1 addition & 0 deletions .github/workflows/tests_lkcuda_python.yml
@@ -102,6 +102,7 @@ jobs:
with:
key: ${{ matrix.os }}-kokkos${{ matrix.kokkos_version }}-${{ matrix.exec_model }}-${{ github.ref }}-${{ github.sha }}
path: ${{ github.workspace }}/Kokkos_install/${{ matrix.exec_model }}
retention-days: 1

pythontestswithKokkos:
needs: [builddeps]
@@ -19,7 +19,7 @@ findCutensornet(lightning_external_libs)

set(LTENSOR_MPS_FILES MPSTNCuda.cpp CACHE INTERNAL "" FORCE)

add_library(${PL_TENSOR} STATIC ${LTENSOR_MPS_FILES})
add_library(${PL_BACKEND} STATIC ${LTENSOR_MPS_FILES})

##########################
## Enforce C++ Standard ##
@@ -35,23 +35,23 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
#########################
option(PL_DISABLE_CUDA_SAFETY "Build without CUDA call safety checks" OFF)

target_link_libraries(${PL_TENSOR} PUBLIC lightning_compile_options
target_link_libraries(${PL_BACKEND} PUBLIC lightning_compile_options
lightning_external_libs
${PL_BACKEND}_gates
${PL_BACKEND}_tensor
${PL_BACKEND}_tensornetBase
${PL_BACKEND}_utils
)

target_include_directories(${PL_TENSOR} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
set_property(TARGET ${PL_TENSOR} PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(${PL_BACKEND} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
set_property(TARGET ${PL_BACKEND} PROPERTY POSITION_INDEPENDENT_CODE ON)

# To avoid DSO errors on platforms preferring static linkage, uncomment the following line:
# string(REPLACE "libcudart_static.a" "libcudart.so" CUDA_SHARED_RT "${CUDA_LIBRARIES}")
set_target_properties(${PL_TENSOR} PROPERTIES INSTALL_RPATH "$ORIGIN/../cuquantum/lib:$ORIGIN/../cuquantum/lib64:$ORIGIN/")
set_target_properties(${PL_BACKEND} PROPERTIES INSTALL_RPATH "$ORIGIN/../cuquantum/lib:$ORIGIN/../cuquantum/lib64:$ORIGIN/")

if(PL_DISABLE_CUDA_SAFETY)
target_compile_options(${PL_TENSOR} INTERFACE $<$<COMPILE_LANGUAGE:CXX>:-DCUDA_UNSAFE>)
target_compile_options(${PL_BACKEND} INTERFACE $<$<COMPILE_LANGUAGE:CXX>:-DCUDA_UNSAFE>)
endif()

###############################################################################
@@ -82,6 +82,8 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
public:
MPSTNCuda() = delete;

// TODO: Add method to the constructor to allow users to select methods at
// runtime in the C++ layer
explicit MPSTNCuda(const std::size_t numQubits,
const std::size_t maxBondDim)
: BaseType(numQubits), maxBondDim_(maxBondDim),
@@ -90,6 +92,8 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
initTensors_();
}

// TODO: Add method to the constructor to allow users to select methods at
// runtime in the C++ layer
explicit MPSTNCuda(const std::size_t numQubits,
const std::size_t maxBondDim, DevTag<int> dev_tag)
: BaseType(numQubits, dev_tag), maxBondDim_(maxBondDim),
@@ -72,6 +72,8 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
public:
TNCudaBase() = delete;

// TODO: Add method to the constructor to allow users to select methods at
// runtime in the C++ layer
explicit TNCudaBase(const std::size_t numQubits, int device_id = 0,
cudaStream_t stream_id = 0)
: BaseType(numQubits), handle_(make_shared_tncuda_handle()),
@@ -98,6 +100,8 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
/* cutensornetState_t * */ &quantumState_));
}

// TODO: Add method to the constructor to allow users to select methods at
// runtime in the C++ layer
explicit TNCudaBase(const std::size_t numQubits, DevTag<int> dev_tag)
: BaseType(numQubits), handle_(make_shared_tncuda_handle()),
dev_tag_(dev_tag),
@@ -236,6 +240,13 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
bool adjoint = false,
const std::vector<PrecisionT> &params = {0.0},
const std::vector<ComplexT> &gate_matrix = {}) {
// TODO: Need to revisit this line of code for the exact TN backend.
// We should be able to turn on or skip this check based on the backend,
// e.g. if (getMethod() == "mps") { ... }
PL_ABORT_IF(
wires.size() > 2,
"Unsupported gate: MPS method only supports 1, 2-wires gates");

auto &&par = (params.empty()) ? std::vector<PrecisionT>{0.0} : params;
DataBuffer<PrecisionT, int> dummy_device_data(
Pennylane::Util::exp2(wires.size()), getDevTag());
@@ -259,17 +270,18 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
/* void * */ static_cast<void *>(dummy_device_data.getData()),
/* const int64_t *tensorModeStrides */ nullptr,
/* const int32_t immutable */ 1,
/* const int32_t adjoint */ adjoint,
/* const int32_t adjoint */ 0,
/* const int32_t unitary */ 1,
/* int64_t * */ &id));
if (!gate_matrix.empty()) {
auto gate_key = std::make_pair(opName, par);
std::vector<CFP_t> matrix_cu =
cuUtil::complexToCu<ComplexT>(gate_matrix);
gate_cache_->add_gate(static_cast<std::size_t>(id), gate_key,
matrix_cu);
matrix_cu, adjoint);
} else {
gate_cache_->add_gate(static_cast<std::size_t>(id), opName, par);
gate_cache_->add_gate(static_cast<std::size_t>(id), opName, par,
adjoint);
}
PL_CUTENSORNET_IS_SUCCESS(cutensornetStateUpdateTensorOperator(
/* const cutensornetHandle_t */ getTNCudaHandle(),
@@ -20,7 +20,7 @@ target_link_libraries(tensornet_base_tests INTERFACE Catch2::Catch2)
ProcessTestOptions(tensornet_base_tests)

# Create dependency on the dynamically defined simulator/backend target.
target_link_libraries(tensornet_base_tests INTERFACE ${PL_TENSOR} ${PL_TENSOR}_utils)
target_link_libraries(tensornet_base_tests INTERFACE ${PL_BACKEND} ${PL_BACKEND}_tncuda_utils)

target_sources(tensornet_base_tests INTERFACE runner_lightning_tensor_tensornetBase.cpp)

@@ -73,15 +73,19 @@ template <class PrecisionT> class TNCudaGateCache {
* @param gate_name String representing the name of the given gate.
* @param gate_param Vector of parameter values. `{}` if non-parametric
* gate.
* @param adjoint Boolean value indicating whether the adjoint of the gate
* is to be appended. The default is false.
*/
void add_gate(const std::size_t gate_id, const std::string &gate_name,
[[maybe_unused]] std::vector<PrecisionT> gate_param = {}) {
[[maybe_unused]] std::vector<PrecisionT> gate_param = {},
bool adjoint = false) {
auto gate_key = std::make_pair(gate_name, gate_param);

auto &gateMap =
cuGates::DynamicGateDataAccess<PrecisionT>::getInstance();

add_gate(gate_id, gate_key, gateMap.getGateData(gate_name, gate_param));
add_gate(gate_id, gate_key, gateMap.getGateData(gate_name, gate_param),
adjoint);
}
/**
* @brief Add gate numerical value to the cache, indexed by the id of gate
@@ -93,10 +97,12 @@ template <class PrecisionT> class TNCudaGateCache {
* its associated parameter value.
* @param gate_data_host Vector of complex floating point values
* representing the gate data on host.
* @param adjoint Boolean value indicating whether the adjoint of the gate
* is to be appended. The default is false.
*/

void add_gate(const std::size_t gate_id, gate_key_info gate_key,
const std::vector<CFP_t> &gate_data_host) {
const std::vector<CFP_t> &gate_data_host,
bool adjoint = false) {
const std::size_t rank = Pennylane::Util::log2(gate_data_host.size());
auto modes = std::vector<std::size_t>(rank, 0);
auto extents = std::vector<std::size_t>(rank, 2);
Expand All @@ -108,8 +114,34 @@ template <class PrecisionT> class TNCudaGateCache {
std::piecewise_construct, std::forward_as_tuple(gate_id),
std::forward_as_tuple(gate_key, std::move(tensor)));

device_gates_.at(gate_id).second.getDataBuffer().CopyHostDataToGpu(
gate_data_host.data(), gate_data_host.size());
if (adjoint) {
// TODO: This is a temporary solution for the gate-data transpose.
// There should be a better way to handle this, but it is not a big
// performance issue for now since gate matrices are small.
// TODO: The implementation here can be optimized by generating the
// data buffer directly on the device instead of performing the
// transpose on the host.
std::vector<CFP_t> data_host_transpose(gate_data_host.size());

const std::size_t col_size = 1 << (rank / 2);
const std::size_t row_size = col_size;

PL_ASSERT(col_size * row_size == gate_data_host.size());

for (std::size_t idx = 0; idx < gate_data_host.size(); idx++) {
std::size_t col = idx / row_size;
std::size_t row = idx % row_size;

data_host_transpose.at(row * col_size + col) = {
gate_data_host.at(idx).x, -gate_data_host.at(idx).y};
}

device_gates_.at(gate_id).second.getDataBuffer().CopyHostDataToGpu(
data_host_transpose.data(), data_host_transpose.size());
} else {
device_gates_.at(gate_id).second.getDataBuffer().CopyHostDataToGpu(
gate_data_host.data(), gate_data_host.size());
}

total_alloc_bytes_ += (sizeof(CFP_t) * gate_data_host.size());
}
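Note the design choice here: in `applyOperation` above, the adjoint flag handed to the cutensornet call is fixed at `0`, and the adjoint is instead folded into the cached gate data by the host-side conjugate transpose in this method. A NumPy sketch of the same flat-index arithmetic (illustrative only, not part of the C++ code):

```python
import numpy as np

def conjugate_transpose_flat(gate_data, dim):
    """Mirror of the host-side loop: swap row/col in the flat index and
    negate the imaginary part."""
    out = np.empty(dim * dim, dtype=complex)
    for idx, val in enumerate(gate_data):
        col, row = idx // dim, idx % dim
        out[row * dim + col] = val.conjugate()
    return out

# Check against NumPy's conjugate transpose for a 2x2 example.
mat = np.array([[1, 0], [0, 1j]])  # S gate
flat = conjugate_transpose_flat(mat.flatten(), 2)
assert np.allclose(flat.reshape(2, 2), mat.conj().T)
```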
@@ -17,7 +17,7 @@ FetchAndIncludeCatch()
add_library(${PL_BACKEND}_gates_tests INTERFACE)
target_link_libraries(${PL_BACKEND}_gates_tests INTERFACE Catch2::Catch2
${PL_BACKEND}_gates
${PL_TENSOR}
${PL_BACKEND}
)

ProcessTestOptions(${PL_BACKEND}_gates_tests)