Skip to content

Commit

Permalink
Optimize gate cache recording for lightning.tensor (#879)
Browse files Browse the repository at this point in the history
### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`. 

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[SC-72517]

The updated implementation of `applyOperation` avoids the overhead of a
`cutensornetStateUpdateTensorOperator` call and the creation of a temporary
`DataBuffer` object. A new `gate_ids_` private data member is added for the
quick generation of a fresh key that does not yet exist in the `gate_cache_`.

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <[email protected]>
Co-authored-by: Luis Alfredo Nuñez Meneses <[email protected]>
  • Loading branch information
3 people authored Aug 30, 2024
1 parent 756eb7b commit b9d3334
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 22 deletions.
5 changes: 4 additions & 1 deletion .github/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@

### Improvements

* Optimize gate cache recording for `lightning.tensor` C++ layer.
[(#879)](https://github.com/PennyLaneAI/pennylane-lightning/pull/879)

* Updated calls of ``size_t`` to ``std::size_t`` everywhere.
[(#816)](https://github.com/PennyLaneAI/pennylane-lightning/pull/816/)
[(#816)](https://github.com/PennyLaneAI/pennylane-lightning/pull/816)

* Update `ctrl_decomp_zyz` tests with `len(control_wires) > 1`.
[(#821)](https://github.com/PennyLaneAI/pennylane-lightning/pull/821)
Expand Down
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.38.0-dev52"
__version__ = "0.38.0-dev53"
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <complex>
#include <memory>
#include <set>
#include <type_traits>
#include <vector>

Expand Down Expand Up @@ -70,6 +71,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
// states as v24.03

std::shared_ptr<TNCudaGateCache<PrecisionT>> gate_cache_;
std::set<int64_t> gate_ids_;

public:
TNCudaBase() = delete;
Expand Down Expand Up @@ -265,8 +267,18 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
"Unsupported gate: MPS method only supports 1, 2-wires gates");

auto &&par = (params.empty()) ? std::vector<PrecisionT>{0.0} : params;
DataBuffer<PrecisionT, int> dummy_device_data(
Pennylane::Util::exp2(wires.size()), getDevTag());

int64_t dummy_id = gate_ids_.empty() ? 1 : *gate_ids_.rbegin() + 1;

if (gate_matrix.empty()) [[likely]] {
gate_cache_->add_gate(dummy_id, opName, par, adjoint);
} else [[unlikely]] {
auto gate_key = std::make_pair(opName, par);
std::vector<CFP_t> matrix_cu =
cuUtil::complexToCu<ComplexT>(gate_matrix);
gate_cache_->add_gate(dummy_id, gate_key, matrix_cu, adjoint);
}

int64_t id;

std::vector<int32_t> stateModes =
Expand All @@ -284,30 +296,19 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
/* cutensornetState_t */ getQuantumState(),
/* int32_t numStateModes */ stateModes.size(),
/* const int32_t * stateModes */ stateModes.data(),
/* void * */ static_cast<void *>(dummy_device_data.getData()),
/* void * */
static_cast<void *>(gate_cache_->get_gate_device_ptr(dummy_id)),
/* const int64_t *tensorModeStrides */ nullptr,
/* const int32_t immutable */ 0,
/* const int32_t adjoint */ 0,
/* const int32_t unitary */ 1,
/* int64_t * */ &id));
if (!gate_matrix.empty()) {
auto gate_key = std::make_pair(opName, par);
std::vector<CFP_t> matrix_cu =
cuUtil::complexToCu<ComplexT>(gate_matrix);
gate_cache_->add_gate(static_cast<std::size_t>(id), gate_key,
matrix_cu, adjoint);
} else {
gate_cache_->add_gate(static_cast<std::size_t>(id), opName, par,
adjoint);

if (dummy_id != id) {
gate_cache_->update_key(dummy_id, id);
}
PL_CUTENSORNET_IS_SUCCESS(cutensornetStateUpdateTensorOperator(
/* const cutensornetHandle_t */ getTNCudaHandle(),
/* cutensornetState_t */ getQuantumState(),
/* int64_t tensorId*/ id,
/* void* */
static_cast<void *>(
gate_cache_->get_gate_device_ptr(static_cast<std::size_t>(id))),
/* int32_t unitary*/ 1));

gate_ids_.insert(id);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,18 @@ template <class PrecisionT> class TNCudaGateCache {
*/
auto is_empty() const -> bool { return device_gates_.empty(); }

/**
 * @brief Re-key a cached gate entry: the tensor data stored under
 * `old_key` becomes indexed by `new_key` without copying the
 * underlying device buffer.
 *
 * If `old_key` is not present in the cache, the call is a no-op.
 *
 * @param old_key Existing key in `device_gates_` to be re-keyed.
 * @param new_key Replacement key for the same cache entry.
 */
void update_key(const std::size_t old_key, const std::size_t new_key) {
    // extract() detaches the map node in place, so the gate's device
    // buffer is neither copied nor reallocated. Guard against a missing
    // old_key: calling key() on an empty node handle is undefined
    // behavior.
    auto node = device_gates_.extract(old_key);
    if (!node.empty()) {
        node.key() = new_key;
        device_gates_.insert(std::move(node));
    }
}

private:
const DevTag<int> device_tag_;
std::size_t total_alloc_bytes_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,33 @@ TEMPLATE_TEST_CASE("MPSTNCuda::Gates::PauliX", "[MPSTNCuda_Nonparam]", float,
}
}

TEMPLATE_TEST_CASE("MPSTNCuda::Gates::applyOperation-gatematrix",
                   "[MPSTNCuda_Nonparam]", float, double) {
    using ComplexT = std::complex<TestType>;
    const std::size_t n_wires = 3;
    const std::size_t max_bond = 2;
    DevTag<int> dev_tag{0, 0};

    SECTION("Apply different wire indices") {
        const std::size_t target = GENERATE(0, 1, 2);
        MPSTNCuda<TestType> state{n_wires, max_bond, dev_tag};

        // PauliX supplied as a raw gate matrix ({{0, 1}, {1, 0}}) to
        // exercise the matrix-based applyOperation code path.
        const std::vector<ComplexT> mtx{
            cuUtil::ZERO<ComplexT>(), cuUtil::ONE<ComplexT>(),
            cuUtil::ONE<ComplexT>(), cuUtil::ZERO<ComplexT>()};

        state.applyOperation("applyMatrix", {target}, false, {}, mtx);

        const auto host_data = state.getDataVector();

        // Starting from |0...0>, flipping `target` moves the amplitude
        // from index 0 to the basis state with that single bit set.
        CHECK(host_data[0] == cuUtil::ZERO<ComplexT>());
        CHECK(host_data[std::size_t{1} << (n_wires - target - 1)] ==
              cuUtil::ONE<ComplexT>());
    }
}

TEMPLATE_TEST_CASE("MPSTNCuda::Gates::PauliY", "[MPSTNCuda_Nonparam]", float,
double) {
const bool inverse = GENERATE(false, true);
Expand Down

0 comments on commit b9d3334

Please sign in to comment.