From b9d33349bdeac878cd9c949bf49fa2a371cf90df Mon Sep 17 00:00:00 2001
From: Shuli Shu <31480676+multiphaseCFD@users.noreply.github.com>
Date: Fri, 30 Aug 2024 14:12:18 -0400
Subject: [PATCH] Optimize gate cache recording for `lightning.tensor` (#879)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`.

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[SC-72517]

With this change, `applyOperation` avoids the overhead of a
`cutensornetStateUpdateTensorOperator` call and of creating a new
`DataBuffer` object. A new private member, `gate_ids_`, is added to quickly
generate a new key that does not yet exist in the `gate_cache_`.

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Co-authored-by: Luis Alfredo Nuñez Meneses <alfredo.nunez@xanadu.ai>
---
 .github/CHANGELOG.md                          |  5 ++-
 pennylane_lightning/core/_version.py          |  2 +-
 .../lightning_tensor/tncuda/TNCudaBase.hpp    | 41 ++++++++++---------
 .../tncuda/gates/TNCudaGateCache.hpp          | 12 ++++++
 .../gates/tests/Test_MPSTNCuda_NonParam.cpp   | 27 ++++++++++++
 5 files changed, 65 insertions(+), 22 deletions(-)
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 18cd6b8aff..1c791e1ee9 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -36,8 +36,11 @@
 
 ### Improvements
 
+* Optimize gate cache recording for `lightning.tensor` C++ layer.
+  [(#879)](https://github.com/PennyLaneAI/pennylane-lightning/pull/879)
+
 * Updated calls of ``size_t`` to ``std::size_t`` everywhere.
-  [(#816)](https://github.com/PennyLaneAI/pennylane-lightning/pull/816/)
+  [(#816)](https://github.com/PennyLaneAI/pennylane-lightning/pull/816)
 
 * Update `ctrl_decomp_zyz` tests with `len(control_wires) > 1`.
   [(#821)](https://github.com/PennyLaneAI/pennylane-lightning/pull/821)
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 30103c25ab..d7b122382e 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.38.0-dev52"
+__version__ = "0.38.0-dev53"
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp
index b545ef890d..df3594ebc2 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp
@@ -21,6 +21,7 @@
 
 #include <complex>
 #include <memory>
+#include <set>
 #include <type_traits>
 #include <vector>
 
@@ -70,6 +71,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
                                        // states as v24.03
 
     std::shared_ptr<TNCudaGateCache<PrecisionT>> gate_cache_;
+    std::set<int64_t> gate_ids_;
 
   public:
     TNCudaBase() = delete;
@@ -265,8 +267,18 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
             "Unsupported gate: MPS method only supports 1, 2-wires gates");
 
         auto &&par = (params.empty()) ? std::vector<PrecisionT>{0.0} : params;
-        DataBuffer<PrecisionT, int> dummy_device_data(
-            Pennylane::Util::exp2(wires.size()), getDevTag());
+
+        int64_t dummy_id = gate_ids_.empty() ? 1 : *gate_ids_.rbegin() + 1;
+
+        if (gate_matrix.empty()) [[likely]] {
+            gate_cache_->add_gate(dummy_id, opName, par, adjoint);
+        } else [[unlikely]] {
+            auto gate_key = std::make_pair(opName, par);
+            std::vector<CFP_t> matrix_cu =
+                cuUtil::complexToCu<ComplexT>(gate_matrix);
+            gate_cache_->add_gate(dummy_id, gate_key, matrix_cu, adjoint);
+        }
+
         int64_t id;
 
         std::vector<int32_t> stateModes =
@@ -284,30 +296,19 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
             /* cutensornetState_t */ getQuantumState(),
             /* int32_t numStateModes */ stateModes.size(),
             /* const int32_t * stateModes */ stateModes.data(),
-            /* void * */ static_cast<void *>(dummy_device_data.getData()),
+            /* void * */
+            static_cast<void *>(gate_cache_->get_gate_device_ptr(dummy_id)),
             /* const int64_t *tensorModeStrides */ nullptr,
             /* const int32_t immutable */ 0,
             /* const int32_t adjoint */ 0,
             /* const int32_t unitary */ 1,
             /* int64_t * */ &id));
-        if (!gate_matrix.empty()) {
-            auto gate_key = std::make_pair(opName, par);
-            std::vector<CFP_t> matrix_cu =
-                cuUtil::complexToCu<ComplexT>(gate_matrix);
-            gate_cache_->add_gate(static_cast<std::size_t>(id), gate_key,
-                                  matrix_cu, adjoint);
-        } else {
-            gate_cache_->add_gate(static_cast<std::size_t>(id), opName, par,
-                                  adjoint);
+
+        if (dummy_id != id) {
+            gate_cache_->update_key(dummy_id, id);
         }
-        PL_CUTENSORNET_IS_SUCCESS(cutensornetStateUpdateTensorOperator(
-            /* const cutensornetHandle_t */ getTNCudaHandle(),
-            /* cutensornetState_t */ getQuantumState(),
-            /* int64_t tensorId*/ id,
-            /* void* */
-            static_cast<void *>(
-                gate_cache_->get_gate_device_ptr(static_cast<std::size_t>(id))),
-            /* int32_t unitary*/ 1));
+
+        gate_ids_.insert(id);
     }
 
     /**
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/TNCudaGateCache.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/TNCudaGateCache.hpp
index 30e5d824f1..d1d08e266e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/TNCudaGateCache.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/TNCudaGateCache.hpp
@@ -173,6 +173,18 @@ template <class PrecisionT> class TNCudaGateCache {
      */
     auto is_empty() const -> bool { return device_gates_.empty(); }
 
+    /**
+     * @brief Re-key a cached gate: move the entry at old_key to new_key.
+     * NOTE(review): assumes old_key is present and new_key is free - confirm.
+     * @param old_key Key the entry is currently stored under.
+     * @param new_key Key the entry is re-inserted under.
+     */
+    void update_key(const std::size_t old_key, const std::size_t new_key) {
+        auto it = device_gates_.extract(old_key);
+        it.key() = new_key;
+        device_gates_.insert(std::move(it));
+    }
+
   private:
     const DevTag<int> device_tag_;
     std::size_t total_alloc_bytes_;
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
index 8718cb1934..83d3d73230 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
@@ -120,6 +120,33 @@ TEMPLATE_TEST_CASE("MPSTNCuda::Gates::PauliX", "[MPSTNCuda_Nonparam]", float,
     }
 }
 
+TEMPLATE_TEST_CASE("MPSTNCuda::Gates::applyOperation-gatematrix",
+                   "[MPSTNCuda_Nonparam]", float, double) {
+    std::size_t num_qubits = 3;
+    std::size_t maxExtent = 2;
+    DevTag<int> dev_tag{0, 0};
+    // Applies a caller-supplied gate matrix through applyOperation.
+    SECTION("Apply different wire indices") {
+        const std::size_t index = GENERATE(0, 1, 2);
+        MPSTNCuda<TestType> mps_state{num_qubits, maxExtent, dev_tag};
+        // Four entries ZERO, ONE, ONE, ZERO - presumably the Pauli-X matrix.
+        std::vector<std::complex<TestType>> gate_matrix = {
+            cuUtil::ZERO<std::complex<TestType>>(),
+            cuUtil::ONE<std::complex<TestType>>(),
+            cuUtil::ONE<std::complex<TestType>>(),
+            cuUtil::ZERO<std::complex<TestType>>()};
+        // params is empty; the gate is given via gate_matrix instead.
+        mps_state.applyOperation("applyMatrix", {index}, false, {},
+                                 gate_matrix);
+
+        auto results = mps_state.getDataVector();
+        // results[0] is ZERO and the entry selected by wire `index` is ONE.
+        CHECK(results[0] == cuUtil::ZERO<std::complex<TestType>>());
+        CHECK(results[0b1 << (num_qubits - index - 1)] ==
+              cuUtil::ONE<std::complex<TestType>>());
+    }
+}
+
 TEMPLATE_TEST_CASE("MPSTNCuda::Gates::PauliY", "[MPSTNCuda_Nonparam]", float,
                    double) {
     const bool inverse = GENERATE(false, true);