From e96a53fa9789253b4a766c86b078f5cbf62d603f Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
Date: Thu, 7 Sep 2023 10:35:37 -0400
Subject: [PATCH] Template/expval (#489)

* M  pennylane_lightning/core/src/bindings/Bindings.hpp; hack `JacobianData` to work with devices.
M  pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp; `applyMatrix` bugfix: use intermediate hostview to copy matrix data; same bugfix for `getDataVector`.
M  pennylane_lightning/core/src/simulators/lightning_kokkos/algorithms/AdjointJacobianKokkos.hpp; use copy constructor.
M  pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp; use copy constructor.
M  pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.hpp; use copy constructor.
M  requirements-dev.txt; add clang-format-14.

* Auto update version

* Update changelog.

* Auto update version

* Auto update version

* Add an argument to adjointJacobian to avoid syncing and copying state vector data in adjoint-diff.

* Reformat

* trigger CI

* [skip ci] Update changelog.

* Introduce std::unordered_map<std::string, ExpValFunc> expval_funcs_.

* Introduce applyExpectationValueFunctor.

* Add binding to LKokkos expval(matrix, wires). Combine expval functor calls into two templated methods. Call specialized expval methods when possible. Remove obsolete 'Apply directly' tests.

* Update changelog.

* Add test for arbitrary expval(Hermitian).

* Add getExpectationValueMultiQubitOpFunctor.

* Add typename hint for macos.

* Add typename macos.

* Use Kokkos::ThreadVectorRange policy for innerloop in getExpectationValueMultiQubitOpFunctor.

* Auto update version

* Auto update version

* Couple fix for HIP.

* WIP

* Add specialized 3-5 qubit expval functors.

* Add implementation notes.

* [skip ci] Polish getExpValMatrix and limit getExpValNQubitOpFunctor to N == 4 (seems optimal on OPENMP/HIP backends although CUDA can go further).

* Fix whitespace warning.

* Auto update version

* Use simplified bitshifts in 1- & 2-qubit expval kernels.

* Update tests.

* Update changelog.

* Bump pennylane version.

* Auto update version

* Reimplement expval functors with macros.

* Auto update version

* Update CHANGELOG.md

* Auto update version

* trigger CI

* trigger CI

* Bump Kokkos to 4.1.00 in CI.

* Revert kokkos ver.

* Add tests for macroed expval functors.

* Remove redundant black lines.

* Use matrix interface to get expval of HermitianObs in LKokkos.

* Cover kokkos_args error.

---------

Co-authored-by: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com>
---
 .github/CHANGELOG.md                          |   3 +
 pennylane_lightning/core/_version.py          |   2 +-
 .../lightning_kokkos/gates/README.md          |  75 +++
 .../measurements/CMakeLists.txt               |   1 +
 .../measurements/ExpValFunctors.hpp           | 478 ++++++++++++++----
 .../measurements/MeasurementsKokkos.hpp       |  65 ++-
 .../tests/Test_StateVectorKokkos_Expval.cpp   | 154 ++++++
 .../lightning_kokkos/lightning_kokkos.py      |   2 +-
 tests/test_device.py                          |   9 +
 tests/test_expval.py                          |  10 +-
 tests/test_gates.py                           |  27 +
 11 files changed, 709 insertions(+), 117 deletions(-)
 create mode 100644 pennylane_lightning/core/src/simulators/lightning_kokkos/gates/README.md

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 9d081c9de2..b165f9e656 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -12,6 +12,9 @@
 
 ### Improvements
 
+* Refactor LKokkos `Measurements` class to use Kokkos `RangePolicy` together with special functors to obtain the expectation value of 1- to 4-wire generic unitary gates. For more than 4 wires, the general implementation using Kokkos `TeamPolicy` is employed to yield the best all-around performance. 
+  [(#489)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/489)
+
 * Add tests to increase LKokkos coverage.
   [(#485)](https://github.com/PennyLaneAI/pennylane-lightning/pull/485)
 
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 57254db4dd..22113c5ec6 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.33.0-dev3"
+__version__ = "0.33.0-dev4"
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/gates/README.md b/pennylane_lightning/core/src/simulators/lightning_kokkos/gates/README.md
new file mode 100644
index 0000000000..1957e4c206
--- /dev/null
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/gates/README.md
@@ -0,0 +1,75 @@
+# Implementation
+
+## Expval kernels
+
+The general multi-qubit operator kernel requires a private variable `coeffs_in` to store state vector coefficients.
+In the Kokkos framework, this variable cannot be a private member of the functor, say, because all threads will overwrite it.
+One must then use `TeamPolicy`s together with `scratch_memory_space` which allows creating and manipulating thread-local variables.
+This implementation however appears suboptimal compared with the straightforward `RangePolicy` with bit-injection one.
+
+The last being more verbose, it is only implemented for 1- and 2-qubit observables.
+It is however possible to generate the code automatically for higher qubit counts with the following Python script. 
+
+```python
+for n_wires in range(1, 6):
+    name = f"getExpVal{n_wires}QubitOpFunctor"
+    nU = 2**n_wires
+
+    print(
+        f"""template <class PrecisionT, std::size_t n_wires> struct {name} {{
+        
+        using ComplexT = Kokkos::complex<PrecisionT>;
+        using KokkosComplexVector = Kokkos::View<ComplexT *>;
+        using KokkosIntVector = Kokkos::View<std::size_t *>;
+
+        KokkosComplexVector arr;
+        KokkosComplexVector matrix;
+        KokkosIntVector wires;
+        std::size_t dim;
+        std::size_t num_qubits;
+
+        {name}(const KokkosComplexVector &arr_,
+        const std::size_t num_qubits_,
+                                    const KokkosComplexVector &matrix_,
+                                    const KokkosIntVector &wires_) {{
+            wires = wires_;
+            arr = arr_;
+            matrix = matrix_;
+            num_qubits = num_qubits_;
+            dim = 1U << wires.size();
+        }}
+        
+        KOKKOS_INLINE_FUNCTION
+        void operator()(const std::size_t k, PrecisionT &expval) const {{
+        const std::size_t kdim = k * dim;
+    """
+    )
+
+    for k in range(nU):
+        print(
+            f"""
+        std::size_t i{k:0{n_wires}b} = kdim | {k};
+        for (std::size_t pos = 0; pos < n_wires; pos++) {{
+            std::size_t x =
+                ((i{k:0{n_wires}b} >> (n_wires - pos - 1)) ^
+                    (i{k:0{n_wires}b} >> (num_qubits - wires(pos) - 1))) &
+                1U;
+            i{k:0{n_wires}b} = i{k:0{n_wires}b} ^ ((x << (n_wires - pos - 1)) |
+                            (x << (num_qubits - wires(pos) - 1)));
+        }}
+        """
+        )
+
+    # print("expval += real(")
+    for k in range(nU):
+        tmp = f"expval += real(conj(arr(i{k:0{n_wires}b})) * ("
+        tmp += f"matrix(0B{k:0{n_wires}b}{0:0{n_wires}b}) * arr(i{0:0{n_wires}b})"
+        for j in range(1, nU):
+            tmp += (
+                f" + matrix(0B{k:0{n_wires}b}{j:0{n_wires}b}) * arr(i{j:0{n_wires}b})"
+            )
+        print(tmp, end="")
+        print("));")
+    print("}")
+    print("};")
+```
\ No newline at end of file
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/CMakeLists.txt
index 47050770ca..6c8effc873 100644
--- a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/CMakeLists.txt
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/CMakeLists.txt
@@ -11,6 +11,7 @@ target_link_libraries(${PL_BACKEND}_measurements INTERFACE  lightning_compile_op
                                                             lightning_observables
                                                             lightning_utils
                                                             ${PL_BACKEND}
+                                                            ${PL_BACKEND}_observables
                                                             ${PL_BACKEND}_utils
                                                             )
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/ExpValFunctors.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/ExpValFunctors.hpp
index 8cb1eded63..89972d05c2 100644
--- a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/ExpValFunctors.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/ExpValFunctors.hpp
@@ -161,102 +161,7 @@ template <class PrecisionT> struct getExpectationValueHadamardFunctor {
     }
 };
 
-template <class PrecisionT> struct getExpectationValueSingleQubitOpFunctor {
-    Kokkos::View<Kokkos::complex<PrecisionT> *> arr;
-    Kokkos::View<Kokkos::complex<PrecisionT> *> matrix;
-
-    std::size_t rev_wire;
-    std::size_t rev_wire_shift;
-    std::size_t wire_parity;
-    std::size_t wire_parity_inv;
-
-    getExpectationValueSingleQubitOpFunctor(
-        Kokkos::View<Kokkos::complex<PrecisionT> *> &arr_,
-        std::size_t num_qubits,
-        const Kokkos::View<Kokkos::complex<PrecisionT> *> &matrix_,
-        const std::vector<size_t> &wires) {
-        arr = arr_;
-        matrix = matrix_;
-        rev_wire = num_qubits - wires[0] - 1;
-        rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
-        wire_parity = fillTrailingOnes(rev_wire);
-        wire_parity_inv = fillLeadingOnes(rev_wire + 1);
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    void operator()(const std::size_t k, PrecisionT &expval) const {
-        const std::size_t i0 =
-            ((k << 1U) & wire_parity_inv) | (wire_parity & k);
-        const std::size_t i1 = i0 | rev_wire_shift;
-
-        expval += real(
-            conj(arr[i0]) * (matrix[0B00] * arr[i0] + matrix[0B01] * arr[i1]) +
-            conj(arr[i1]) * (matrix[0B10] * arr[i0] + matrix[0B11] * arr[i1]));
-    }
-};
-
-template <class PrecisionT> struct getExpectationValueTwoQubitOpFunctor {
-    Kokkos::View<Kokkos::complex<PrecisionT> *> arr;
-    Kokkos::View<Kokkos::complex<PrecisionT> *> matrix;
-
-    std::size_t rev_wire0;
-    std::size_t rev_wire1;
-    std::size_t rev_wire0_shift;
-    std::size_t rev_wire1_shift;
-    std::size_t rev_wire_min;
-    std::size_t rev_wire_max;
-    std::size_t parity_low;
-    std::size_t parity_high;
-    std::size_t parity_middle;
-
-    getExpectationValueTwoQubitOpFunctor(
-        Kokkos::View<Kokkos::complex<PrecisionT> *> &arr_,
-        std::size_t num_qubits,
-        const Kokkos::View<Kokkos::complex<PrecisionT> *> &matrix_,
-        const std::vector<size_t> &wires) {
-        rev_wire0 = num_qubits - wires[1] - 1;
-        rev_wire1 = num_qubits - wires[0] - 1;
-
-        rev_wire0_shift = static_cast<size_t>(1U) << rev_wire0;
-        rev_wire1_shift = static_cast<size_t>(1U) << rev_wire1;
-
-        rev_wire_min = std::min(rev_wire0, rev_wire1);
-        rev_wire_max = std::max(rev_wire0, rev_wire1);
-
-        parity_low = fillTrailingOnes(rev_wire_min);
-        parity_high = fillLeadingOnes(rev_wire_max + 1);
-        parity_middle =
-            fillLeadingOnes(rev_wire_min + 1) & fillTrailingOnes(rev_wire_max);
-
-        arr = arr_;
-        matrix = matrix_;
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    void operator()(const std::size_t k, PrecisionT &expval) const {
-        const std::size_t i00 = ((k << 2U) & parity_high) |
-                                ((k << 1U) & parity_middle) | (k & parity_low);
-        const std::size_t i10 = i00 | rev_wire1_shift;
-        const std::size_t i01 = i00 | rev_wire0_shift;
-        const std::size_t i11 = i00 | rev_wire0_shift | rev_wire1_shift;
-
-        expval +=
-            real(conj(arr[i00]) *
-                     (matrix[0B0000] * arr[i00] + matrix[0B0001] * arr[i01] +
-                      matrix[0B0010] * arr[i10] + matrix[0B0011] * arr[i11]) +
-                 conj(arr[i01]) *
-                     (matrix[0B0100] * arr[i00] + matrix[0B0101] * arr[i01] +
-                      matrix[0B0110] * arr[i10] + matrix[0B0111] * arr[i11]) +
-                 conj(arr[i10]) *
-                     (matrix[0B1000] * arr[i00] + matrix[0B1001] * arr[i01] +
-                      matrix[0B1010] * arr[i10] + matrix[0B1011] * arr[i11]) +
-                 conj(arr[i11]) *
-                     (matrix[0B1100] * arr[i00] + matrix[0B1101] * arr[i01] +
-                      matrix[0B1110] * arr[i10] + matrix[0B1111] * arr[i11]));
-    }
-};
-
-template <class PrecisionT> struct getExpectationValueMultiQubitOpFunctor {
+template <class PrecisionT> struct getExpValMultiQubitOpFunctor {
     using ComplexT = Kokkos::complex<PrecisionT>;
     using KokkosComplexVector = Kokkos::View<ComplexT *>;
     using KokkosIntVector = Kokkos::View<std::size_t *>;
@@ -272,13 +177,18 @@ template <class PrecisionT> struct getExpectationValueMultiQubitOpFunctor {
     std::size_t dim;
     std::size_t num_qubits;
 
-    getExpectationValueMultiQubitOpFunctor(const KokkosComplexVector &arr_,
-                                           std::size_t num_qubits_,
-                                           const KokkosComplexVector &matrix_,
-                                           KokkosIntVector &wires_) {
+    getExpValMultiQubitOpFunctor(const KokkosComplexVector &arr_,
+                                 std::size_t num_qubits_,
+                                 const KokkosComplexVector &matrix_,
+                                 const std::vector<std::size_t> &wires_) {
+        Kokkos::View<const std::size_t *, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+            wires_host(wires_.data(), wires_.size());
+        Kokkos::resize(wires, wires_.size());
+        Kokkos::deep_copy(wires, wires_host);
+
         dim = 1U << wires_.size();
         num_qubits = num_qubits_;
-        wires = wires_;
         arr = arr_;
         matrix = matrix_;
     }
@@ -355,4 +265,370 @@ template <class PrecisionT> struct getExpectationValueSparseFunctor {
     }
 };
 
+template <class PrecisionT> struct getExpVal1QubitOpFunctor {
+    using ComplexT = Kokkos::complex<PrecisionT>;
+    using KokkosComplexVector = Kokkos::View<ComplexT *>;
+    using KokkosIntVector = Kokkos::View<std::size_t *>;
+
+    KokkosComplexVector arr;
+    KokkosComplexVector matrix;
+    const std::size_t n_wires = 1;
+    const std::size_t dim = 1U << n_wires;
+    std::size_t num_qubits;
+    std::size_t rev_wire;
+    std::size_t rev_wire_shift;
+    std::size_t wire_parity;
+    std::size_t wire_parity_inv;
+
+    getExpVal1QubitOpFunctor(
+        const KokkosComplexVector &arr_, const std::size_t num_qubits_,
+        const KokkosComplexVector &matrix_,
+        [[maybe_unused]] const std::vector<std::size_t> &wires_) {
+        arr = arr_;
+        matrix = matrix_;
+        num_qubits = num_qubits_;
+        rev_wire = num_qubits - wires_[0] - 1;
+        rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
+        wire_parity = fillTrailingOnes(rev_wire);
+        wire_parity_inv = fillLeadingOnes(rev_wire + 1);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const std::size_t k, PrecisionT &expval) const {
+        const std::size_t i0 =
+            ((k << 1U) & wire_parity_inv) | (wire_parity & k);
+        const std::size_t i1 = i0 | rev_wire_shift;
+
+        expval += real(conj(arr(i0)) *
+                       (matrix(0B00) * arr(i0) + matrix(0B01) * arr(i1)));
+        expval += real(conj(arr(i1)) *
+                       (matrix(0B10) * arr(i0) + matrix(0B11) * arr(i1)));
+    }
+};
+
+#define ENTRY2(xx, yy) xx << 2 | yy
+#define TERM2(xx, yy, iyy) matrix(ENTRY2(xx, yy)) * arr(iyy)
+#define EXPVAL2(ixx, xx)                                                       \
+    conj(arr(ixx)) * (TERM2(xx, 0B00, i00) + TERM2(xx, 0B01, i01) +            \
+                      TERM2(xx, 0B10, i10) + TERM2(xx, 0B11, i11))
+
+template <class PrecisionT> struct getExpVal2QubitOpFunctor {
+    using ComplexT = Kokkos::complex<PrecisionT>;
+    using KokkosComplexVector = Kokkos::View<ComplexT *>;
+    using KokkosIntVector = Kokkos::View<std::size_t *>;
+
+    KokkosComplexVector arr;
+    KokkosComplexVector matrix;
+    const std::size_t n_wires = 2;
+    const std::size_t dim = 1U << n_wires;
+    std::size_t num_qubits;
+    std::size_t rev_wire0;
+    std::size_t rev_wire1;
+    std::size_t rev_wire0_shift;
+    std::size_t rev_wire1_shift;
+    std::size_t rev_wire_min;
+    std::size_t rev_wire_max;
+    std::size_t parity_low;
+    std::size_t parity_high;
+    std::size_t parity_middle;
+
+    getExpVal2QubitOpFunctor(
+        const KokkosComplexVector &arr_, const std::size_t num_qubits_,
+        const KokkosComplexVector &matrix_,
+        [[maybe_unused]] const std::vector<std::size_t> &wires_) {
+        arr = arr_;
+        matrix = matrix_;
+        num_qubits = num_qubits_;
+
+        rev_wire0 = num_qubits - wires_[1] - 1;
+        rev_wire1 = num_qubits - wires_[0] - 1;
+        rev_wire0_shift = static_cast<size_t>(1U) << rev_wire0;
+        rev_wire1_shift = static_cast<size_t>(1U) << rev_wire1;
+        rev_wire_min = std::min(rev_wire0, rev_wire1);
+        rev_wire_max = std::max(rev_wire0, rev_wire1);
+        parity_low = fillTrailingOnes(rev_wire_min);
+        parity_high = fillLeadingOnes(rev_wire_max + 1);
+        parity_middle =
+            fillLeadingOnes(rev_wire_min + 1) & fillTrailingOnes(rev_wire_max);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const std::size_t k, PrecisionT &expval) const {
+        const std::size_t i00 = ((k << 2U) & parity_high) |
+                                ((k << 1U) & parity_middle) | (k & parity_low);
+        const std::size_t i10 = i00 | rev_wire1_shift;
+        const std::size_t i01 = i00 | rev_wire0_shift;
+        const std::size_t i11 = i00 | rev_wire0_shift | rev_wire1_shift;
+
+        expval += real(EXPVAL2(i00, 0B00));
+        expval += real(EXPVAL2(i10, 0B10));
+        expval += real(EXPVAL2(i01, 0B01));
+        expval += real(EXPVAL2(i11, 0B11));
+    }
+};
+
+#define ENTRY3(xx, yy) xx << 3 | yy
+#define TERM3(xx, yy, iyy) matrix(ENTRY3(xx, yy)) * arr(iyy)
+#define EXPVAL3(ixx, xx)                                                       \
+    conj(arr(ixx)) * (TERM3(xx, 0B000, i000) + TERM3(xx, 0B001, i001) +        \
+                      TERM3(xx, 0B010, i010) + TERM3(xx, 0B011, i011) +        \
+                      TERM3(xx, 0B100, i100) + TERM3(xx, 0B101, i101) +        \
+                      TERM3(xx, 0B110, i110) + TERM3(xx, 0B111, i111))
+#define INDEX(ivar, xx)                                                        \
+    std::size_t ivar = kdim | xx;                                              \
+    for (std::size_t pos = 0; pos < n_wires; pos++) {                          \
+        std::size_t x = ((ivar >> (n_wires - pos - 1)) ^                       \
+                         (ivar >> (num_qubits - wires(pos) - 1))) &            \
+                        1U;                                                    \
+        ivar = ivar ^ ((x << (n_wires - pos - 1)) |                            \
+                       (x << (num_qubits - wires(pos) - 1)));                  \
+    }
+
+template <class PrecisionT> struct getExpVal3QubitOpFunctor {
+    using ComplexT = Kokkos::complex<PrecisionT>;
+    using KokkosComplexVector = Kokkos::View<ComplexT *>;
+    using KokkosIntVector = Kokkos::View<std::size_t *>;
+
+    KokkosComplexVector arr;
+    KokkosComplexVector matrix;
+    KokkosIntVector wires;
+    const std::size_t n_wires = 3;
+    const std::size_t dim = 1U << n_wires;
+    std::size_t num_qubits;
+
+    getExpVal3QubitOpFunctor(const KokkosComplexVector &arr_,
+                             const std::size_t num_qubits_,
+                             const KokkosComplexVector &matrix_,
+                             const std::vector<std::size_t> &wires_) {
+        Kokkos::View<const std::size_t *, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+            wires_host(wires_.data(), wires_.size());
+        Kokkos::resize(wires, wires_.size());
+        Kokkos::deep_copy(wires, wires_host);
+
+        arr = arr_;
+        matrix = matrix_;
+        num_qubits = num_qubits_;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const std::size_t k, PrecisionT &expval) const {
+        const std::size_t kdim = k * dim;
+        INDEX(i000, 0B000);
+        INDEX(i001, 0B001);
+        INDEX(i010, 0B010);
+        INDEX(i011, 0B011);
+        INDEX(i100, 0B100);
+        INDEX(i101, 0B101);
+        INDEX(i110, 0B110);
+        INDEX(i111, 0B111);
+
+        expval += real(EXPVAL3(i000, 0B000));
+        expval += real(EXPVAL3(i001, 0B001));
+        expval += real(EXPVAL3(i010, 0B010));
+        expval += real(EXPVAL3(i011, 0B011));
+        expval += real(EXPVAL3(i100, 0B100));
+        expval += real(EXPVAL3(i101, 0B101));
+        expval += real(EXPVAL3(i110, 0B110));
+        expval += real(EXPVAL3(i111, 0B111));
+    }
+};
+
+#define ENTRY4(xx, yy) xx << 4 | yy
+#define TERM4(xx, yy, iyy) matrix(ENTRY4(xx, yy)) * arr(iyy)
+#define EXPVAL4(ixx, xx)                                                       \
+    conj(arr(ixx)) * (TERM4(xx, 0B0000, i0000) + TERM4(xx, 0B0001, i0001) +    \
+                      TERM4(xx, 0B0010, i0010) + TERM4(xx, 0B0011, i0011) +    \
+                      TERM4(xx, 0B0100, i0100) + TERM4(xx, 0B0101, i0101) +    \
+                      TERM4(xx, 0B0110, i0110) + TERM4(xx, 0B0111, i0111) +    \
+                      TERM4(xx, 0B1000, i1000) + TERM4(xx, 0B1001, i1001) +    \
+                      TERM4(xx, 0B1010, i1010) + TERM4(xx, 0B1011, i1011) +    \
+                      TERM4(xx, 0B1100, i1100) + TERM4(xx, 0B1101, i1101) +    \
+                      TERM4(xx, 0B1110, i1110) + TERM4(xx, 0B1111, i1111))
+
+template <class PrecisionT> struct getExpVal4QubitOpFunctor {
+    using ComplexT = Kokkos::complex<PrecisionT>;
+    using KokkosComplexVector = Kokkos::View<ComplexT *>;
+    using KokkosIntVector = Kokkos::View<std::size_t *>;
+
+    KokkosComplexVector arr;
+    KokkosComplexVector matrix;
+    KokkosIntVector wires;
+    const std::size_t n_wires = 4;
+    const std::size_t dim = 1U << n_wires;
+    std::size_t num_qubits;
+
+    getExpVal4QubitOpFunctor(const KokkosComplexVector &arr_,
+                             const std::size_t num_qubits_,
+                             const KokkosComplexVector &matrix_,
+                             const std::vector<std::size_t> &wires_) {
+        Kokkos::View<const std::size_t *, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+            wires_host(wires_.data(), wires_.size());
+        Kokkos::resize(wires, wires_.size());
+        Kokkos::deep_copy(wires, wires_host);
+        arr = arr_;
+        matrix = matrix_;
+        num_qubits = num_qubits_;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const std::size_t k, PrecisionT &expval) const {
+        const std::size_t kdim = k * dim;
+
+        INDEX(i0000, 0B0000);
+        INDEX(i0001, 0B0001);
+        INDEX(i0010, 0B0010);
+        INDEX(i0011, 0B0011);
+        INDEX(i0100, 0B0100);
+        INDEX(i0101, 0B0101);
+        INDEX(i0110, 0B0110);
+        INDEX(i0111, 0B0111);
+        INDEX(i1000, 0B1000);
+        INDEX(i1001, 0B1001);
+        INDEX(i1010, 0B1010);
+        INDEX(i1011, 0B1011);
+        INDEX(i1100, 0B1100);
+        INDEX(i1101, 0B1101);
+        INDEX(i1110, 0B1110);
+        INDEX(i1111, 0B1111);
+
+        expval += real(EXPVAL4(i0000, 0B0000));
+        expval += real(EXPVAL4(i0001, 0B0001));
+        expval += real(EXPVAL4(i0010, 0B0010));
+        expval += real(EXPVAL4(i0011, 0B0011));
+        expval += real(EXPVAL4(i0100, 0B0100));
+        expval += real(EXPVAL4(i0101, 0B0101));
+        expval += real(EXPVAL4(i0110, 0B0110));
+        expval += real(EXPVAL4(i0111, 0B0111));
+        expval += real(EXPVAL4(i1000, 0B1000));
+        expval += real(EXPVAL4(i1001, 0B1001));
+        expval += real(EXPVAL4(i1010, 0B1010));
+        expval += real(EXPVAL4(i1011, 0B1011));
+        expval += real(EXPVAL4(i1100, 0B1100));
+        expval += real(EXPVAL4(i1101, 0B1101));
+        expval += real(EXPVAL4(i1110, 0B1110));
+        expval += real(EXPVAL4(i1111, 0B1111));
+    }
+};
+
+#define ENTRY5(xx, yy) xx << 5 | yy
+#define TERM5(xx, yy, iyy) matrix(ENTRY5(xx, yy)) * arr(iyy)
+#define EXPVAL5(ixx, xx)                                                       \
+    conj(arr(ixx)) *                                                           \
+        (TERM5(xx, 0B00000, i00000) + TERM5(xx, 0B00001, i00001) +             \
+         TERM5(xx, 0B00010, i00010) + TERM5(xx, 0B00011, i00011) +             \
+         TERM5(xx, 0B00100, i00100) + TERM5(xx, 0B00101, i00101) +             \
+         TERM5(xx, 0B00110, i00110) + TERM5(xx, 0B00111, i00111) +             \
+         TERM5(xx, 0B01000, i01000) + TERM5(xx, 0B01001, i01001) +             \
+         TERM5(xx, 0B01010, i01010) + TERM5(xx, 0B01011, i01011) +             \
+         TERM5(xx, 0B01100, i01100) + TERM5(xx, 0B01101, i01101) +             \
+         TERM5(xx, 0B01110, i01110) + TERM5(xx, 0B01111, i01111) +             \
+         TERM5(xx, 0B10000, i10000) + TERM5(xx, 0B10001, i10001) +             \
+         TERM5(xx, 0B10010, i10010) + TERM5(xx, 0B10011, i10011) +             \
+         TERM5(xx, 0B10100, i10100) + TERM5(xx, 0B10101, i10101) +             \
+         TERM5(xx, 0B10110, i10110) + TERM5(xx, 0B10111, i10111) +             \
+         TERM5(xx, 0B11000, i11000) + TERM5(xx, 0B11001, i11001) +             \
+         TERM5(xx, 0B11010, i11010) + TERM5(xx, 0B11011, i11011) +             \
+         TERM5(xx, 0B11100, i11100) + TERM5(xx, 0B11101, i11101) +             \
+         TERM5(xx, 0B11110, i11110) + TERM5(xx, 0B11111, i11111))
+
+template <class PrecisionT> struct getExpVal5QubitOpFunctor {
+    using ComplexT = Kokkos::complex<PrecisionT>;
+    using KokkosComplexVector = Kokkos::View<ComplexT *>;
+    using KokkosIntVector = Kokkos::View<std::size_t *>;
+
+    KokkosComplexVector arr;
+    KokkosComplexVector matrix;
+    KokkosIntVector wires;
+    const std::size_t n_wires = 5;
+    const std::size_t dim = 1U << n_wires;
+    std::size_t num_qubits;
+
+    getExpVal5QubitOpFunctor(const KokkosComplexVector &arr_,
+                             const std::size_t num_qubits_,
+                             const KokkosComplexVector &matrix_,
+                             const std::vector<std::size_t> &wires_) {
+        Kokkos::View<const std::size_t *, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+            wires_host(wires_.data(), wires_.size());
+        Kokkos::resize(wires, wires_.size());
+        Kokkos::deep_copy(wires, wires_host);
+        arr = arr_;
+        matrix = matrix_;
+        num_qubits = num_qubits_;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const std::size_t k, PrecisionT &expval) const {
+        const std::size_t kdim = k * dim;
+
+        INDEX(i00000, 0B00000);
+        INDEX(i00001, 0B00001);
+        INDEX(i00010, 0B00010);
+        INDEX(i00011, 0B00011);
+        INDEX(i00100, 0B00100);
+        INDEX(i00101, 0B00101);
+        INDEX(i00110, 0B00110);
+        INDEX(i00111, 0B00111);
+        INDEX(i01000, 0B01000);
+        INDEX(i01001, 0B01001);
+        INDEX(i01010, 0B01010);
+        INDEX(i01011, 0B01011);
+        INDEX(i01100, 0B01100);
+        INDEX(i01101, 0B01101);
+        INDEX(i01110, 0B01110);
+        INDEX(i01111, 0B01111);
+        INDEX(i10000, 0B10000);
+        INDEX(i10001, 0B10001);
+        INDEX(i10010, 0B10010);
+        INDEX(i10011, 0B10011);
+        INDEX(i10100, 0B10100);
+        INDEX(i10101, 0B10101);
+        INDEX(i10110, 0B10110);
+        INDEX(i10111, 0B10111);
+        INDEX(i11000, 0B11000);
+        INDEX(i11001, 0B11001);
+        INDEX(i11010, 0B11010);
+        INDEX(i11011, 0B11011);
+        INDEX(i11100, 0B11100);
+        INDEX(i11101, 0B11101);
+        INDEX(i11110, 0B11110);
+        INDEX(i11111, 0B11111);
+
+        expval += real(EXPVAL5(i00000, 0B00000));
+        expval += real(EXPVAL5(i00001, 0B00001));
+        expval += real(EXPVAL5(i00010, 0B00010));
+        expval += real(EXPVAL5(i00011, 0B00011));
+        expval += real(EXPVAL5(i00100, 0B00100));
+        expval += real(EXPVAL5(i00101, 0B00101));
+        expval += real(EXPVAL5(i00110, 0B00110));
+        expval += real(EXPVAL5(i00111, 0B00111));
+        expval += real(EXPVAL5(i01000, 0B01000));
+        expval += real(EXPVAL5(i01001, 0B01001));
+        expval += real(EXPVAL5(i01010, 0B01010));
+        expval += real(EXPVAL5(i01011, 0B01011));
+        expval += real(EXPVAL5(i01100, 0B01100));
+        expval += real(EXPVAL5(i01101, 0B01101));
+        expval += real(EXPVAL5(i01110, 0B01110));
+        expval += real(EXPVAL5(i01111, 0B01111));
+        expval += real(EXPVAL5(i10000, 0B10000));
+        expval += real(EXPVAL5(i10001, 0B10001));
+        expval += real(EXPVAL5(i10010, 0B10010));
+        expval += real(EXPVAL5(i10011, 0B10011));
+        expval += real(EXPVAL5(i10100, 0B10100));
+        expval += real(EXPVAL5(i10101, 0B10101));
+        expval += real(EXPVAL5(i10110, 0B10110));
+        expval += real(EXPVAL5(i10111, 0B10111));
+        expval += real(EXPVAL5(i11000, 0B11000));
+        expval += real(EXPVAL5(i11001, 0B11001));
+        expval += real(EXPVAL5(i11010, 0B11010));
+        expval += real(EXPVAL5(i11011, 0B11011));
+        expval += real(EXPVAL5(i11100, 0B11100));
+        expval += real(EXPVAL5(i11101, 0B11101));
+        expval += real(EXPVAL5(i11110, 0B11110));
+        expval += real(EXPVAL5(i11111, 0B11111));
+    }
+};
+
 } // namespace Pennylane::LightningKokkos::Functors
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp
index e78b590561..dc9e92a342 100644
--- a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp
@@ -21,6 +21,7 @@
 #include "MeasurementsBase.hpp"
 #include "MeasuresFunctors.hpp"
 #include "Observables.hpp"
+#include "ObservablesKokkos.hpp"
 #include "StateVectorKokkos.hpp"
 #include "Util.hpp"
 
@@ -29,6 +30,7 @@ namespace {
 using namespace Pennylane::Measures;
 using namespace Pennylane::Observables;
 using Pennylane::LightningKokkos::StateVectorKokkos;
+using Pennylane::LightningKokkos::Observables::HermitianObs;
 using Pennylane::LightningKokkos::Util::getRealOfComplexInnerProduct;
 using Pennylane::LightningKokkos::Util::SparseMV_Kokkos;
 using Pennylane::Util::exp2;
@@ -128,18 +130,49 @@ class Measurements final
      */
     auto getExpValMatrix(const KokkosVector &matrix,
                          const std::vector<std::size_t> &wires) -> PrecisionT {
-        if (wires.size() == 1) {
-            return applyExpValFunctor<getExpectationValueSingleQubitOpFunctor,
-                                      1>(matrix, wires);
-        } else if (wires.size() == 2) {
-            return applyExpValFunctor<getExpectationValueTwoQubitOpFunctor, 2>(
-                matrix, wires);
-        } else {
-            StateVectorT ob_sv{this->_statevector};
-            ob_sv.applyMultiQubitOp(matrix, wires);
-            return getRealOfComplexInnerProduct(this->_statevector.getView(),
-                                                ob_sv.getView());
+        std::size_t num_qubits = this->_statevector.getNumQubits();
+        std::size_t two2N = std::exp2(num_qubits - wires.size());
+        std::size_t dim = std::exp2(wires.size());
+        const KokkosVector arr_data = this->_statevector.getView();
+
+        PrecisionT expval = 0.0;
+        switch (wires.size()) {
+        case 1:
+            Kokkos::parallel_reduce(two2N,
+                                    getExpVal1QubitOpFunctor<PrecisionT>(
+                                        arr_data, num_qubits, matrix, wires),
+                                    expval);
+            break;
+        case 2:
+            Kokkos::parallel_reduce(two2N,
+                                    getExpVal2QubitOpFunctor<PrecisionT>(
+                                        arr_data, num_qubits, matrix, wires),
+                                    expval);
+            break;
+        case 3:
+            Kokkos::parallel_reduce(two2N,
+                                    getExpVal3QubitOpFunctor<PrecisionT>(
+                                        arr_data, num_qubits, matrix, wires),
+                                    expval);
+            break;
+        case 4:
+            Kokkos::parallel_reduce(two2N,
+                                    getExpVal4QubitOpFunctor<PrecisionT>(
+                                        arr_data, num_qubits, matrix, wires),
+                                    expval);
+            break;
+        default:
+            std::size_t scratch_size = ScratchViewComplex::shmem_size(dim);
+            Kokkos::parallel_reduce(
+                "getExpValMultiQubitOpFunctor",
+                TeamPolicy(two2N, Kokkos::AUTO, dim)
+                    .set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
+                getExpValMultiQubitOpFunctor<PrecisionT>(arr_data, num_qubits,
+                                                         matrix, wires),
+                expval);
+            break;
         }
+        return expval;
     }
 
     /**
@@ -155,6 +188,16 @@ class Measurements final
                                             ob_sv.getView());
     }
 
+    /**
+     * @brief Calculate expectation value for a HermitianObs.
+     *
+     * @param ob HermitianObs.
+     * @return Expectation value with respect to the given observable.
+     */
+    PrecisionT expval(const HermitianObs<StateVectorT> &ob) {
+        return expval(ob.getMatrix(), ob.getWires());
+    }
+
     /**
      * @brief Expected value of an observable.
      *
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/tests/Test_StateVectorKokkos_Expval.cpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/tests/Test_StateVectorKokkos_Expval.cpp
index 7eec6715b3..63204a5109 100644
--- a/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/tests/Test_StateVectorKokkos_Expval.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/tests/Test_StateVectorKokkos_Expval.cpp
@@ -15,6 +15,7 @@
 #include <complex>
 #include <cstddef>
 #include <limits>
+#include <random>
 #include <type_traits>
 #include <utility>
 #include <vector>
@@ -38,6 +39,7 @@ using namespace Pennylane::LightningKokkos::Observables;
 using Pennylane::Util::createNonTrivialState;
 using Pennylane::Util::write_CSR_vectors;
 using std::size_t;
+std::mt19937_64 re{1337};
 } // namespace
 /// @endcond
 
@@ -379,6 +381,158 @@ TEMPLATE_TEST_CASE("Test expectation value of TensorProdObs",
     }
 }
 
+TEMPLATE_TEST_CASE("Test expectation value of NQubit Hermitian",
+                   "[StateVectorKokkos_Expval]", float, double) {
+    using ComplexT = StateVectorKokkos<TestType>::ComplexT;
+    using VectorT = TestVector<std::complex<TestType>>;
+    const size_t num_qubits = 7;
+    VectorT sv_data = createRandomStateVectorData<TestType>(re, num_qubits);
+
+    StateVectorKokkos<TestType> kokkos_sv(
+        reinterpret_cast<ComplexT *>(sv_data.data()), sv_data.size());
+    auto m = Measurements(kokkos_sv);
+
+    auto X0 = std::make_shared<NamedObs<StateVectorKokkos<TestType>>>(
+        "PauliX", std::vector<size_t>{0});
+    auto Y1 = std::make_shared<NamedObs<StateVectorKokkos<TestType>>>(
+        "PauliY", std::vector<size_t>{1});
+    auto Z2 = std::make_shared<NamedObs<StateVectorKokkos<TestType>>>(
+        "PauliZ", std::vector<size_t>{2});
+    auto X3 = std::make_shared<NamedObs<StateVectorKokkos<TestType>>>(
+        "PauliX", std::vector<size_t>{3});
+    auto Y4 = std::make_shared<NamedObs<StateVectorKokkos<TestType>>>(
+        "PauliY", std::vector<size_t>{4});
+
+    ComplexT j{0.0, 1.0};
+    ComplexT u{1.0, 0.0};
+    ComplexT z{0.0, 0.0};
+
+    SECTION("3Qubit") {
+        auto ob =
+            TensorProdObs<StateVectorKokkos<TestType>>::create({X0, Y1, Z2});
+        auto expected = m.expval(*ob);
+        std::vector<ComplexT> matrix{z, z, z, z,  z, z,  -j, z, z,  z, z, z, z,
+                                     z, z, j, z,  z, z,  z,  j, z,  z, z, z, z,
+                                     z, z, z, -j, z, z,  z,  z, -j, z, z, z, z,
+                                     z, z, z, z,  j, z,  z,  z, z,  j, z, z, z,
+                                     z, z, z, z,  z, -j, z,  z, z,  z, z, z};
+        SECTION("Hermitian") {
+            auto hermitian =
+                HermitianObs<StateVectorKokkos<TestType>>(matrix, {0, 1, 2});
+            auto res = m.expval(hermitian);
+
+            CHECK(expected == Approx(res));
+        }
+        SECTION("Matrix") {
+            auto res = m.expval(matrix, {0, 1, 2});
+            CHECK(expected == Approx(res));
+        }
+    }
+
+    SECTION("4Qubit") {
+        auto ob = TensorProdObs<StateVectorKokkos<TestType>>::create(
+            {X0, Y1, Z2, X3});
+        auto expected = m.expval(*ob);
+        std::vector<ComplexT> matrix{
+            z, z, z,  z, z, z, z, z,  z,  z, z, z, z, -j, z, z, z, z, z, z,
+            z, z, z,  z, z, z, z, z,  -j, z, z, z, z, z,  z, z, z, z, z, z,
+            z, z, z,  z, z, z, z, j,  z,  z, z, z, z, z,  z, z, z, z, z, z,
+            z, z, j,  z, z, z, z, z,  z,  z, z, z, z, j,  z, z, z, z, z, z,
+            z, z, z,  z, z, z, z, z,  j,  z, z, z, z, z,  z, z, z, z, z, z,
+            z, z, z,  z, z, z, z, -j, z,  z, z, z, z, z,  z, z, z, z, z, z,
+            z, z, -j, z, z, z, z, z,  z,  z, z, z, z, -j, z, z, z, z, z, z,
+            z, z, z,  z, z, z, z, z,  -j, z, z, z, z, z,  z, z, z, z, z, z,
+            z, z, z,  z, z, z, z, j,  z,  z, z, z, z, z,  z, z, z, z, z, z,
+            z, z, j,  z, z, z, z, z,  z,  z, z, z, z, j,  z, z, z, z, z, z,
+            z, z, z,  z, z, z, z, z,  j,  z, z, z, z, z,  z, z, z, z, z, z,
+            z, z, z,  z, z, z, z, -j, z,  z, z, z, z, z,  z, z, z, z, z, z,
+            z, z, -j, z, z, z, z, z,  z,  z, z, z, z, z,  z, z};
+        SECTION("Hermitian") {
+            auto hermitian =
+                HermitianObs<StateVectorKokkos<TestType>>(matrix, {0, 1, 2, 3});
+            auto res = m.expval(hermitian);
+
+            CHECK(expected == Approx(res));
+        }
+        SECTION("Matrix") {
+            auto res = m.expval(matrix, {0, 1, 2, 3});
+            CHECK(expected == Approx(res));
+        }
+    }
+
+    SECTION("5Qubit") {
+        auto ob = TensorProdObs<StateVectorKokkos<TestType>>::create(
+            {X0, Y1, Z2, X3, Y4});
+        auto expected = m.expval(*ob);
+        std::vector<ComplexT> matrix{
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  -u, z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  u, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, -u, z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  u, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, u,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            -u, z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, u, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  -u, z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  u,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, -u, z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  u, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, -u, z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  -u, z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  u, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, -u, z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  u, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  -u, z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  u, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, -u, z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  u,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  u,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, -u, z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  u, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, -u, z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  u, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, -u, z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  u, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  -u, z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  -u, z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  u, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, -u, z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  u,  z,  z,  z, z, z,  z, z,  z, z, z,
+            z,  z, z,  z, z,  z,  z, z,  z,  z,  z,  z, z, z,  z, z,  z};
+        SECTION("Hermitian") {
+            auto hermitian = HermitianObs<StateVectorKokkos<TestType>>(
+                matrix, {0, 1, 2, 3, 4});
+            auto res = m.expval(hermitian);
+
+            CHECK(expected == Approx(res));
+        }
+        SECTION("Matrix") {
+            auto res = m.expval(matrix, {0, 1, 2, 3, 4});
+            CHECK(expected == Approx(res));
+        }
+    }
+}
+
 TEMPLATE_TEST_CASE("StateVectorKokkos::Hamiltonian_expval_Sparse",
                    "[StateVectorKokkos_Expval]", float, double) {
     using ComplexT = StateVectorKokkos<TestType>::ComplexT;
diff --git a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
index 671a5d8611..56cda8f92c 100644
--- a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
+++ b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
@@ -205,7 +205,7 @@ def __init__(
             else:
                 type0 = type(InitializationSettings())
                 raise TypeError(
-                    f"Argument kokkos_args must be of {type0} but it is of {type(kokkos_args)}."
+                    f"Argument kokkos_args must be of type {type0} but it is of {type(kokkos_args)}."
                 )
             self._sync = sync
 
diff --git a/tests/test_device.py b/tests/test_device.py
index 001e502009..a45f269fb1 100644
--- a/tests/test_device.py
+++ b/tests/test_device.py
@@ -39,3 +39,12 @@ def test_create_device_with_dtype(C):
 def test_create_device_with_unsupported_dtype():
     with pytest.raises(TypeError, match="Unsupported complex Type:"):
         dev = qml.device(device_name, wires=1, c_dtype=np.complex256)
+
+
+@pytest.mark.skipif(
+    device_name != "lightning.kokkos" or not ld._CPP_BINARY_AVAILABLE,
+    reason="Only lightning.kokkos has a kwarg kokkos_args.",
+)
+def test_create_device_with_unsupported_kokkos_args():
+    with pytest.raises(TypeError, match="Argument kokkos_args must be of type"):
+        dev = qml.device(device_name, wires=1, kokkos_args=np.complex256)
diff --git a/tests/test_expval.py b/tests/test_expval.py
index 69584187c0..9041eb363a 100644
--- a/tests/test_expval.py
+++ b/tests/test_expval.py
@@ -101,17 +101,21 @@ def test_hadamard_expectation(self, theta, phi, qubit_device, tol):
         ) / np.sqrt(2)
         assert np.allclose(res, expected, tol)
 
-    @pytest.mark.parametrize("n_wires", range(1, 5))
+    @pytest.mark.parametrize("n_wires", range(1, 7))
     def test_hermitian_expectation(self, n_wires, theta, phi, qubit_device, tol):
         """Test that Hadamard expectation value is correct"""
-        dev_def = qml.device("default.qubit", wires=6)
-        dev = qubit_device(wires=6)
+        n_qubits = 7
+        dev_def = qml.device("default.qubit", wires=n_qubits)
+        dev = qubit_device(wires=n_qubits)
         m = 2**n_wires
         U = np.random.rand(m, m) + 1j * np.random.rand(m, m)
         U = U + np.conj(U.T)
         obs = qml.Hermitian(U, wires=range(n_wires))
+        init_state = np.random.rand(2**n_qubits) + 1j * np.random.rand(2**n_qubits)
+        init_state /= np.sqrt(np.dot(np.conj(init_state), init_state))
 
         def circuit():
+            qml.StatePrep(init_state, wires=range(n_qubits))
             qml.RY(theta, wires=[0])
             qml.RY(phi, wires=[1])
             qml.CNOT(wires=[0, 1])
diff --git a/tests/test_gates.py b/tests/test_gates.py
index bcdfca6306..d3d33b8ade 100644
--- a/tests/test_gates.py
+++ b/tests/test_gates.py
@@ -16,6 +16,7 @@
 """
 import pytest
 from conftest import LightningDevice, device_name
+from conftest import THETA, PHI
 
 import itertools
 import numpy as np
@@ -236,3 +237,29 @@ def test_get_diagonalizing_gates(obs, has_rotation):
             assert qml.equal(rot_actual, rot_expected)
     else:
         assert len(actual) == 0
+
+
+@pytest.mark.parametrize("theta,phi", list(zip(THETA, PHI)))
+@pytest.mark.parametrize("n_wires", range(1, 7))
+def test_qubit_unitary(n_wires, theta, phi, tol):
+    """Test that Hadamard expectation value is correct"""
+    n_qubits = 10
+    dev_def = qml.device("default.qubit", wires=n_qubits)
+    dev = qml.device(device_name, wires=n_qubits)
+    m = 2**n_wires
+    U = np.random.rand(m, m) + 1j * np.random.rand(m, m)
+    U, _ = np.linalg.qr(U)
+    init_state = np.random.rand(2**n_qubits) + 1j * np.random.rand(2**n_qubits)
+    init_state /= np.sqrt(np.dot(np.conj(init_state), init_state))
+
+    def circuit():
+        qml.StatePrep(init_state, wires=range(n_qubits))
+        qml.RY(theta, wires=[0])
+        qml.RY(phi, wires=[1])
+        qml.CNOT(wires=[0, 1])
+        qml.QubitUnitary(U, wires=range(2, 2 + n_wires))
+        return qml.state()
+
+    circ = qml.QNode(circuit, dev)
+    circ_def = qml.QNode(circuit, dev_def)
+    assert np.allclose(circ(), circ_def(), tol)