
Add build-time flag to enable OpenMP in Lightning gate kernels #510

Merged Jan 12, 2024 (29 commits)

Changes from 10 commits

Commits (29)
3dab654  Add OMP to CNOT and CZ AVX kernels (mlxd, Sep 26, 2023)
cc666ff  Add OMP to CNOT and CZ to LM kernels (mlxd, Sep 26, 2023)
5b4edb5  Add pragma-controlled parallelism to gate kernels for LQ (mlxd, Sep 27, 2023)
f11058c  Add pragma-controlled parallelism to gate kernels for LQ +1 (mlxd, Sep 27, 2023)
3e09f5c  Add compile-time generation for gate kernel OMP threading in LQ (mlxd, Sep 28, 2023)
bb84d76  Rename OMP macros and add SIMD annotations to short loops (mlxd, Sep 28, 2023)
0a969be  Update macros & uses (mlxd, Sep 28, 2023)
569a232  Remove simd macro from constexpr/consteval (mlxd, Sep 28, 2023)
65002ea  Remove simd from additional constexpr/consteval operations (mlxd, Sep 28, 2023)
89832c0  Update changelog (mlxd, Oct 4, 2023)
862eec9  Merge branch 'master' into kernel_omp (mlxd, Jan 3, 2024)
8bb7bac  Auto update version (github-actions[bot], Jan 3, 2024)
3410383  Fix LM kernels to support changes; remove PI OMP (mlxd, Jan 3, 2024)
9121742  Fix format and CI test builds (mlxd, Jan 3, 2024)
6be1db3  Auto update version (github-actions[bot], Jan 3, 2024)
c258a23  Merge branch 'master' into kernel_omp (mlxd, Jan 3, 2024)
79330a0  Auto update version (github-actions[bot], Jan 3, 2024)
7cda66f  Update pennylane_lightning/core/src/simulators/lightning_qubit/gates/… (mlxd, Jan 3, 2024)
34a1da3  Merge branch 'master' into kernel_omp (mlxd, Jan 3, 2024)
4879a73  Auto update version (github-actions[bot], Jan 3, 2024)
11de339  Trigger CI (mlxd, Jan 3, 2024)
6737cf1  Merge branch 'master' into kernel_omp (mlxd, Jan 12, 2024)
7bd216f  Improve kernel testing with OpenMP (mlxd, Jan 12, 2024)
966b5bc  Add missing directory return in CI (mlxd, Jan 12, 2024)
12f2536  Merge branch 'master' into kernel_omp (mlxd, Jan 12, 2024)
12eaa73  Auto update version (github-actions[bot], Jan 12, 2024)
2bfe44a  Trigger CI (mlxd, Jan 12, 2024)
74f3942  Add my name to the changelog and remove redundant comment (mlxd, Jan 12, 2024)
ed91025  Update pennylane_lightning/core/src/simulators/lightning_qubit/gates/… (mlxd, Jan 12, 2024)
9 changes: 6 additions & 3 deletions .github/CHANGELOG.md
@@ -30,14 +30,17 @@

### Improvements

* OpenMP acceleration can now be enabled at compile time for all `lightning.qubit` gate kernels.
[(#510)](https://github.com/PennyLaneAI/pennylane-lightning/pull/510)

* Update setup.py to allow for multi-package co-existence. The PennyLane_Lightning package is now responsible for the core functionality, and will be depended upon by all other extensions.
[(#504)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/504)
[(#504)](https://github.com/PennyLaneAI/pennylane-lightning/pull/504)

* Refactor LKokkos `StateVectorKokkos` class to use Kokkos `RangePolicy` together with special functors in `applyMultiQubitOp` to apply 1- to 4-wire generic unitary gates. For more than 4 wires, the general implementation using Kokkos `TeamPolicy` is employed to yield the best all-around performance.
[(#490)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/490)
[(#490)](https://github.com/PennyLaneAI/pennylane-lightning/pull/490)

* Refactor LKokkos `Measurements` class to use Kokkos `RangePolicy` together with special functors to obtain the expectation value of 1- to 4-wire generic unitary gates. For more than 4 wires, the general implementation using Kokkos `TeamPolicy` is employed to yield the best all-around performance.
[(#489)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/489)
[(#489)](https://github.com/PennyLaneAI/pennylane-lightning/pull/489)

* Add tests to increase LKokkos coverage.
[(#485)](https://github.com/PennyLaneAI/pennylane-lightning/pull/485)
(next file)
@@ -20,6 +20,7 @@ add_library(lightning_qubit STATIC ${LQUBIT_FILES})

option(ENABLE_BLAS "Enable BLAS" OFF)
option(ENABLE_GATE_DISPATCHER "Enable gate kernel dispatching on AVX/AVX2/AVX512" ON)
option(LQ_ENABLE_KERNEL_OMP "Enable OpenMP pragmas for gate kernels" OFF)

# Inform the compiler that this device is enabled.
target_compile_options(lightning_compile_options INTERFACE "-D_ENABLE_PLQUBIT=1")
@@ -46,6 +47,10 @@ else()
message(STATUS "ENABLE_BLAS is OFF.")
endif()

if(LQ_ENABLE_KERNEL_OMP)
add_definitions("-DPL_LQ_KERNEL_OMP")
endif()

target_link_libraries(lightning_qubit PUBLIC lightning_compile_options
lightning_external_libs
lightning_base
(next file)
@@ -23,6 +23,7 @@
#include "Error.hpp"
#include "GateImplementationsLM.hpp"
#include "GateOperation.hpp"
#include "GatePragmas.hpp"
#include "Gates.hpp"
#include "KernelType.hpp"
#include "LinearAlgebra.hpp"
(two files with large diffs are not rendered by default)

(new file: GatePragmas.hpp)
@@ -0,0 +1,34 @@
// Copyright 2018-2023 Xanadu Quantum Technologies Inc.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @file GatePragmas.hpp
* Defines macros for enabling various OpenMP options in the gate kernel
* definitions.
*/
#pragma once

namespace Pennylane::LightningQubit::Gates::Pragmas {

// Defines utility macros to annotate gate-kernel loops with OpenMP parallel-for
// and OpenMP SIMD pragmas. Selectable at compile time.
#if defined PL_LQ_KERNEL_OMP && defined _OPENMP
#define PRAGMA_WRAP(S) _Pragma(#S)
#define PL_LOOP_PARALLEL(x) PRAGMA_WRAP(omp parallel for collapse(x))
#define PL_LOOP_SIMD PRAGMA_WRAP(omp simd)
#else
#define PL_LOOP_PARALLEL(N)
#define PL_LOOP_SIMD
#endif

}; // namespace Pennylane::LightningQubit::Gates::Pragmas
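The pragmas above are emitted only when both the build flag and OpenMP are present, so a build without `-DLQ_ENABLE_KERNEL_OMP=ON` (or without an OpenMP-capable compiler, which defines `_OPENMP`) compiles the annotated loops as plain serial code. A standalone sketch of the same `_Pragma`-stringizing technique, using the macro definitions from this file; the kernel-style loop body is a hypothetical stand-in, not PennyLane code:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// _Pragma() takes a string literal, so the stringizing wrapper lets a
// macro assemble "omp parallel for collapse(x)" at compile time.
// Without both the build flag and OpenMP, the macros expand to nothing.
#if defined(PL_LQ_KERNEL_OMP) && defined(_OPENMP)
#define PRAGMA_WRAP(S) _Pragma(#S)
#define PL_LOOP_PARALLEL(x) PRAGMA_WRAP(omp parallel for collapse(x))
#define PL_LOOP_SIMD PRAGMA_WRAP(omp simd)
#else
#define PL_LOOP_PARALLEL(N)
#define PL_LOOP_SIMD
#endif

// Hypothetical kernel-style loop: flip the sign of every odd-indexed
// entry. The annotation costs nothing when the macros are empty.
std::vector<double> apply_sign_flip(std::vector<double> data) {
    PL_LOOP_PARALLEL(1)
    for (std::size_t i = 1; i < data.size(); i += 2) {
        data[i] = -data[i];
    }
    return data;
}
```

Note that several commits in this PR remove `PL_LOOP_SIMD` from `constexpr`/`consteval` functions, presumably because OpenMP pragmas are not permitted inside constant-evaluated code.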
(next file)
@@ -16,6 +16,7 @@
* Include all AVX gate implementations
*/
#pragma once
#include "../GatePragmas.hpp"
#include "ApplyCNOT.hpp"
#include "ApplyCRX.hpp"
#include "ApplyCRY.hpp"
(next file)
@@ -305,6 +305,7 @@ constexpr __m512i setr512i(int64_t e0, int64_t e1, int64_t e2, int64_t e3,
template <typename PrecisionT, size_t packed_size, typename Func>
auto toParity(Func &&func) -> AVXIntrinsicType<PrecisionT, packed_size> {
std::array<PrecisionT, packed_size> data{};
PL_LOOP_SIMD
for (size_t idx = 0; idx < packed_size / 2; idx++) {
data[2 * idx + 0] = static_cast<PrecisionT>(1.0) -
2 * static_cast<PrecisionT>(func(idx));
@@ -323,6 +324,7 @@ auto toParity(Func &&func) -> AVXIntrinsicType<PrecisionT, packed_size> {
template <typename PrecisionT, size_t packed_size, typename Func>
auto setValueOneTwo(Func &&func) -> AVXIntrinsicType<PrecisionT, packed_size> {
std::array<PrecisionT, packed_size> data{};
PL_LOOP_SIMD
for (size_t idx = 0; idx < packed_size / 2; idx++) {
data[2 * idx + 0] = static_cast<PrecisionT>(func(idx));
data[2 * idx + 1] = data[2 * idx + 0];
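The new `PL_LOOP_SIMD` annotations above sit on short lane-filling loops. A scalar sketch of the `toParity` pattern, using plain `double` instead of AVX packed types (the loop tail is truncated in the rendered diff, so the second lane assignment here is an inferred reconstruction):

```cpp
#include <array>
#include <cassert>
#include <cstddef>

// Packed values interleave (real, imag) lanes. toParity fills lane
// pair idx with +1.0 when func(idx) == 0 and -1.0 when func(idx) == 1,
// duplicating the value into both lanes of the pair.
template <std::size_t packed_size, typename Func>
std::array<double, packed_size> toParity(Func &&func) {
    std::array<double, packed_size> data{};
    // PL_LOOP_SIMD would annotate this loop in the real kernel
    for (std::size_t idx = 0; idx < packed_size / 2; idx++) {
        data[2 * idx + 0] = 1.0 - 2.0 * static_cast<double>(func(idx));
        data[2 * idx + 1] = data[2 * idx + 0];
    }
    return data;
}
```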
(next file)
@@ -28,6 +28,10 @@
#include <complex>
#include <utility>

namespace {
using namespace Pennylane::LightningQubit::Gates::Pragmas;
}

namespace Pennylane::LightningQubit::Gates::AVXCommon {
template <typename PrecisionT, size_t packed_size> struct ApplyCNOT {
using Precision = PrecisionT;
@@ -58,7 +62,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCNOT {
[[maybe_unused]] bool inverse) {
constexpr static auto perm =
applyInternalInternalPermutation<control, target>();

PL_LOOP_PARALLEL(1)
for (size_t n = 0; n < exp2(num_qubits); n += packed_size / 2) {
const auto v = PrecisionAVXConcept::load(arr + n);
PrecisionAVXConcept::store(arr + n, Permutation::permute<perm>(v));
@@ -99,7 +103,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCNOT {
const size_t max_wire_parity_inv = fillLeadingOnes(rev_wire_max + 1);

constexpr static auto mask = applyInternalExternalMask<control>();

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 1); k += packed_size / 2) {
const size_t i0 =
((k << 1U) & max_wire_parity_inv) | (max_wire_parity & k);
@@ -137,7 +141,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCNOT {
const size_t max_wire_parity_inv = fillLeadingOnes(control + 1);

constexpr static auto perm = applyExternalInternalPermutation<target>();

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 1); k += packed_size / 2) {
const size_t i0 =
((k << 1U) & max_wire_parity_inv) | (max_wire_parity & k);
@@ -163,7 +167,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCNOT {
const size_t parity_high = fillLeadingOnes(rev_wire_max + 1);
const size_t parity_middle =
fillLeadingOnes(rev_wire_min + 1) & fillTrailingOnes(rev_wire_max);

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 2); k += packed_size / 2) {
const size_t i00 = ((k << 2U) & parity_high) |
((k << 1U) & parity_middle) | (k & parity_low);
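The index expressions in these loops, such as `i0 = ((k << 1U) & max_wire_parity_inv) | (max_wire_parity & k)`, implement a bit-insertion trick: `k` runs over a compressed index space and a 0 bit is inserted at the wire position, so the loop visits each pair of amplitudes exactly once. A standalone sketch, with helper semantics inferred from their call sites rather than copied from the library:

```cpp
#include <cassert>
#include <cstddef>

constexpr std::size_t fillTrailingOnes(std::size_t pos) {
    return (std::size_t{1} << pos) - 1; // bits [0, pos) set
}
constexpr std::size_t fillLeadingOnes(std::size_t pos) {
    return ~std::size_t{0} << pos;      // bits [pos, ...) set
}

// Insert a 0 bit at position `wire`: low bits of k stay in place, high
// bits shift up by one. The partner index with that bit set is then
// i0 | (1 << wire).
constexpr std::size_t insertZeroBit(std::size_t k, std::size_t wire) {
    const std::size_t hi = fillLeadingOnes(wire + 1);
    const std::size_t lo = fillTrailingOnes(wire);
    return ((k << 1U) & hi) | (k & lo);
}

// The two-wire i00 computation with parity_high/parity_middle/parity_low
// is the same insertion performed at both wire positions (lower first).
constexpr std::size_t insertTwoZeroBits(std::size_t k, std::size_t lo_wire,
                                        std::size_t hi_wire) {
    return insertZeroBit(insertZeroBit(k, lo_wire), hi_wire);
}
```

With this reading, `exp2(num_qubits - 1)` iterations of `k` cover all indices whose target wire bit is clear, and `exp2(num_qubits - 2)` cover the two-wire case.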
(next file)
@@ -64,7 +64,6 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRX {
template <size_t control, size_t target>
static consteval auto applyInternalInternalPermutation() {
std::array<uint8_t, packed_size> perm{};

for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) { // if control bit is 1
perm[2 * k + 0] = 2 * (k ^ (1U << target)) + 1;
@@ -85,7 +84,7 @@
static auto applyInternalInternalOffDiagFactor(ParamT angle) {
std::array<PrecisionT, packed_size> arr{};

// positions are after permutations
PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) { // if control bit is 1
arr[2 * k + 0] = std::sin(angle / 2);
@@ -105,8 +104,7 @@
template <size_t control, size_t target, class ParamT>
static auto applyInternalInternalDiagFactor(ParamT angle) {
std::array<PrecisionT, packed_size> arr{};

// positions are after permutations
PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) { // if control bit is 1
arr[2 * k + 0] = std::cos(angle / 2);
@@ -134,7 +132,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRX {
applyInternalInternalOffDiagFactor<control, target>(angle);
const auto diag_factor =
applyInternalInternalDiagFactor<control, target>(angle);

PL_LOOP_PARALLEL(1)
for (size_t n = 0; n < exp2(num_qubits); n += packed_size / 2) {
const auto v = PrecisionAVXConcept::load(arr + n);
const auto diag_w = diag_factor * v;
@@ -150,7 +148,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRX {
template <size_t control, typename ParamT>
static auto applyInternalExternalDiagFactor(ParamT angle) {
std::array<Precision, packed_size> arr{};

PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) {
// if control is 1
@@ -170,7 +168,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRX {
template <size_t control, typename ParamT>
static auto applyInternalExternalOffDiagFactor(ParamT angle) {
std::array<Precision, packed_size> arr{};

PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) {
// if control is 1
@@ -212,7 +210,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRX {

constexpr static auto perm = compilePermutation<PrecisionT>(
swapRealImag(identity<packed_size>()));

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 1); k += packed_size / 2) {
const size_t i0 =
((k << 1U) & target_wire_parity_inv) | (target_wire_parity & k);
@@ -238,7 +236,6 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRX {
std::array<uint8_t, packed_size> arr{};

uint8_t s = (uint8_t{1U} << target);

for (size_t k = 0; k < packed_size / 2; k++) {
arr[2 * k + 0] = 2 * (k ^ s) + 1;
arr[2 * k + 1] = 2 * (k ^ s) + 0;
@@ -266,7 +263,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRX {
set1<PrecisionT, packed_size>(std::cos(angle / 2));
const auto offdiag_factor =
imagFactor<PrecisionT, packed_size>(-std::sin(angle / 2));

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 1); k += packed_size / 2) {
const size_t i0 =
((k << 1U) & max_wire_parity_inv) | (max_wire_parity & k);
@@ -308,7 +305,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRX {

constexpr static auto perm = compilePermutation<PrecisionT>(
swapRealImag(identity<packed_size>()));

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 2); k += packed_size / 2) {
const size_t i00 = ((k << 2U) & parity_high) |
((k << 1U) & parity_middle) | (k & parity_low);
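The CRX kernel splits the update into diagonal and off-diagonal factor arrays so each parallelized iteration is a fused multiply-add over packed lanes: `w = diag * v + offdiag * permute(v)`. A scalar reference for what that update computes, using `std::complex` and the convention that qubit `q` maps to bit `q` of the basis index (a sketch for checking the math, not the library's memory layout):

```cpp
#include <cassert>
#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

using C = std::complex<double>;

// CRX(angle): identity on the control=0 subspace; RX(angle) on the
// target qubit where the control bit is 1, with
// RX = cos(angle/2) * I - i sin(angle/2) * X.
std::vector<C> applyCRX(std::vector<C> psi, std::size_t num_qubits,
                        std::size_t control, std::size_t target,
                        double angle) {
    assert(psi.size() == (std::size_t{1} << num_qubits));
    const C c{std::cos(angle / 2), 0.0};
    const C s{0.0, -std::sin(angle / 2)};
    for (std::size_t i = 0; i < psi.size(); i++) {
        // visit each control=1 pair once, via its target=0 member
        if (((i >> control) & 1U) && !((i >> target) & 1U)) {
            const std::size_t j = i | (std::size_t{1} << target);
            const C a = psi[i], b = psi[j];
            psi[i] = c * a + s * b;
            psi[j] = s * a + c * b;
        }
    }
    return psi;
}
```

The AVX code reaches the same result without the branch by baking the control-bit test into the factor arrays, so every lane is processed uniformly.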
(next file)
@@ -61,7 +61,6 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
template <size_t control, size_t target>
static consteval auto applyInternalInternalPermutation() {
std::array<uint8_t, packed_size> perm{};

for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) { // if control bit is 1
perm[2 * k + 0] = 2 * (k ^ (1U << target)) + 0;
@@ -81,7 +80,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
template <size_t control, size_t target, class ParamT>
static auto applyInternalInternalOffDiagFactor(ParamT angle) {
std::array<PrecisionT, packed_size> arr{};
// positions are after permutations
PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) { // if control bit is 1
if ((k >> target) & 1U) {
@@ -105,7 +104,7 @@
static auto applyInternalInternalDiagFactor(ParamT angle) {
std::array<PrecisionT, packed_size> arr{};

// positions are after permutations
PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) { // if control bit is 1
arr[2 * k + 0] = std::cos(angle / 2);
@@ -133,7 +132,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
applyInternalInternalOffDiagFactor<control, target>(angle);
const auto diag_factor =
applyInternalInternalDiagFactor<control, target>(angle);

PL_LOOP_PARALLEL(1)
for (size_t n = 0; n < exp2(num_qubits); n += packed_size / 2) {
const auto v = PrecisionAVXConcept::load(arr + n);
const auto diag_w = diag_factor * v;
@@ -150,7 +149,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
template <size_t control, typename ParamT>
static auto applyInternalExternalDiagFactor(ParamT angle) {
std::array<Precision, packed_size> arr{};

PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) {
// if control is 1
@@ -170,7 +169,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
template <size_t control, typename ParamT>
static auto applyInternalExternalOffDiagFactor(ParamT angle) {
std::array<Precision, packed_size> arr{};

PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> control) & 1U) {
// if control is 1
@@ -211,7 +210,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
const auto off_diag_factor_p =
applyInternalExternalOffDiagFactor<control>(angle);
const auto off_diag_factor_m = -off_diag_factor_p;

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 1); k += packed_size / 2) {
const size_t i0 =
((k << 1U) & target_wire_parity_inv) | (target_wire_parity & k);
@@ -233,6 +232,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
template <size_t target, typename ParamT>
static auto applyExternalInternalOffDiagFactor(ParamT angle) {
std::array<Precision, packed_size> arr{};
PL_LOOP_SIMD
for (size_t k = 0; k < packed_size / 2; k++) {
if ((k >> target) & 1U) { // target bit is 1 (was 0)
arr[2 * k + 0] = std::sin(angle / 2);
@@ -267,6 +267,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
const auto offdiag_factor =
applyExternalInternalOffDiagFactor<target>(angle);

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 1); k += packed_size / 2) {
const size_t i0 =
((k << 1U) & max_wire_parity_inv) | (max_wire_parity & k);
@@ -305,7 +306,7 @@ template <typename PrecisionT, size_t packed_size> struct ApplyCRY {
set1<PrecisionT, packed_size>(std::cos(angle / 2));
const auto sin_factor =
set1<PrecisionT, packed_size>(std::sin(angle / 2));

PL_LOOP_PARALLEL(1)
for (size_t k = 0; k < exp2(num_qubits - 2); k += packed_size / 2) {
const size_t i00 = ((k << 2U) & parity_high) |
((k << 1U) & parity_middle) | (k & parity_low);
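The CRY factor-building loops above all follow one pattern: lane pairs whose control bit is set receive the rotation coefficient, while all other lane pairs receive the identity coefficient, so a single packed multiply applies the controlled operation across every lane without branching. A scalar sketch of that pattern (`controlDiagFactor` is an illustrative helper name, not library code):

```cpp
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>

// Build the per-lane diagonal factor: cos(angle/2) on control=1 lane
// pairs, 1.0 on control=0 lane pairs. Lane pair 2k holds the (real,
// imag) components of packed element k.
template <std::size_t packed_size>
std::array<double, packed_size> controlDiagFactor(std::size_t control,
                                                  double angle) {
    std::array<double, packed_size> arr{};
    for (std::size_t k = 0; k < packed_size / 2; k++) {
        const double f = ((k >> control) & 1U) ? std::cos(angle / 2) : 1.0;
        arr[2 * k + 0] = f; // real lane
        arr[2 * k + 1] = f; // imaginary lane
    }
    return arr;
}
```

The off-diagonal arrays are built the same way with `sin(angle / 2)` in place of the cosine and `0.0` in place of the identity factor.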