PennyLaneAI · mlxd · Apr 25, 2024 · Mar 27, 2024 · Mar 27, 2024 · Mar 27, 2024
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* Add compile-time support for AVX2/512 streaming operations in `lightning.qubit`.
+  [(#664)](https://github.com/PennyLaneAI/pennylane-lightning/pull/664)
+
 * `lightning.kokkos` supports mid-circuit measurements.
   [(#672)](https://github.com/PennyLaneAI/pennylane-lightning/pull/672)
 
@@ -126,7 +129,7 @@
 
 This release contains contributions from (in alphabetical order):
 
-Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Mudit Pandey, Shuli Shu
+Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Lee James O'Riordan, Mudit Pandey, Shuli Shu
 
 ---
 

diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml
@@ -71,18 +71,31 @@ jobs:
               -DENABLE_COVERAGE=ON \
               -DLQ_ENABLE_KERNEL_OMP=ON
 
+            cmake . -BBuildKernelAVXStream -G Ninja \
+              -DCMAKE_BUILD_TYPE=Debug \
+              -DBUILD_TESTS=ON \
+              -DENABLE_PYTHON=OFF \
+              -DPL_BACKEND=${{ matrix.pl_backend }} \
+              -DCMAKE_CXX_COMPILER=$(which g++-$GCC_VERSION) \
+              -DENABLE_COVERAGE=ON \
+              -DLQ_ENABLE_KERNEL_AVX_STREAM=ON \
+              -DLQ_ENABLE_KERNEL_OMP=ON
+
+
             cmake --build ./Build
             cmake --build ./BuildKernelOMP
+            cmake --build ./BuildKernelAVXStream
 
-            for d in Build BuildKernelOMP; do
+            for d in Build BuildKernelOMP BuildKernelAVXStream; do
               cd ./$d
               mkdir -p ./tests/results
               for file in *runner ; do ./$file --order lex --reporter junit --out ./tests/results/report_$file.xml; done;
               lcov --directory . -b ../pennylane_lightning/core/src --capture --output-file coverage.info
               lcov --remove coverage.info '/usr/*' --output-file coverage.info
               cd ..
             done
-            lcov --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info -o coverage.info
+            lcov  --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info \
+                  --add-tracefile ./BuildKernelAVXStream/coverage.info -o coverage.info
             mv coverage.info coverage-${{ github.job }}-${{ matrix.pl_backend }}.info
 
       - name: Upload test results
@@ -93,6 +106,7 @@ jobs:
           path: |
             ./Build/tests/results/
             ./BuildKernelOMP/tests/results/
+            ./BuildKernelAVXStream/tests/results/
 
           if-no-files-found: error
 

diff --git a/doc/lightning_qubit/development/avx_kernels/index.rst b/doc/lightning_qubit/development/avx_kernels/index.rst
@@ -22,3 +22,4 @@ AVX2/AVX512 kernels
 
    implementation
    build_system
+   kernel_tuning
diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
@@ -0,0 +1,13 @@
+Kernel performance tuning
+#########################
+
+Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload.
+
+However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels.
+
+OpenMP threaded kernels
+-----------------------
+
+To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the ``-DLQ_ENABLE_KERNEL_OMP=ON`` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
+
+For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads.
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev40"
+__version__ = "0.36.0-dev41"
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
@@ -21,6 +21,7 @@ add_library(lightning_qubit STATIC ${LQUBIT_FILES})
 option(ENABLE_BLAS "Enable BLAS" OFF)
 option(ENABLE_GATE_DISPATCHER "Enable gate kernel dispatching on AVX/AVX2/AVX512" ON)
 option(LQ_ENABLE_KERNEL_OMP "Enable OpenMP pragmas for gate kernels" OFF)
+option(LQ_ENABLE_KERNEL_AVX_STREAMING "Enable AVX2/512 streaming operations for gate kernels" OFF)
 
 # Inform the compiler that this device is enabled.
 target_compile_options(lightning_compile_options INTERFACE "-D_ENABLE_PLQUBIT=1")
@@ -51,6 +52,13 @@ if(LQ_ENABLE_KERNEL_OMP)
     add_definitions("-DPL_LQ_KERNEL_OMP")
 endif()
 
+if(LQ_ENABLE_KERNEL_AVX_STREAMING)
+    if(NOT LQ_ENABLE_KERNEL_OMP)
+        message(WARNING "AVX streaming operations require `LQ_ENABLE_KERNEL_OMP` to be enabled.")
+    endif()
+    add_definitions("-DPL_LQ_KERNEL_AVX_STREAMING")
+endif()
+
 target_link_libraries(lightning_qubit PUBLIC    lightning_compile_options
                                                 lightning_external_libs
                                                 lightning_base

diff --git a/...ightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp b/...ightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
@@ -80,7 +80,7 @@ template <typename T> struct AVX2Concept {
     }
 
     PL_FORCE_INLINE
-    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+    static void store_(std::complex<PrecisionT> *p, IntrinsicType value) {
         if constexpr (std::is_same_v<PrecisionT, float>) {
             _mm256_store_ps(reinterpret_cast<PrecisionT *>(p), value);
         } else if (std::is_same_v<PrecisionT, double>) {
@@ -91,6 +91,43 @@ template <typename T> struct AVX2Concept {
         }
     }
 
+    PL_FORCE_INLINE
+    static void stream_(std::complex<PrecisionT> *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm256_stream_ps(reinterpret_cast<PrecisionT *>(p), value);
+        } else if (std::is_same_v<PrecisionT, double>) {
+            _mm256_stream_pd(reinterpret_cast<PrecisionT *>(p), value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void stream_(PrecisionT *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm256_stream_ps(p, value);
+        } else if (std::is_same_v<PrecisionT, double>) {
+            _mm256_stream_pd(p, value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+        store(reinterpret_cast<PrecisionT *>(p), value);
+    }
+    PL_FORCE_INLINE
+    static void store(PrecisionT *p, IntrinsicType value) {
+#ifdef PL_LQ_KERNEL_AVX_STREAMING
+        store_(p, value);
+#else
+        stream_(p, value);
+#endif
+    }
+
     PL_FORCE_INLINE
     static auto mul(IntrinsicType v0, IntrinsicType v1) {
         if constexpr (std::is_same_v<PrecisionT, float>) {

diff --git a/...htning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp b/...htning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
@@ -81,7 +81,7 @@
     }
 
     PL_FORCE_INLINE
-    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+    static void store_(std::complex<PrecisionT> *p, IntrinsicType value) {
         if constexpr (std::is_same_v<PrecisionT, float>) {
             _mm512_store_ps(p, value);
         } else if (std::is_same_v<PrecisionT, double>) {
@@ -92,6 +92,44 @@
         }
     }
 
+    PL_FORCE_INLINE
+    static void stream_(std::complex<PrecisionT> *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm512_stream_ps(p, value);
+        } else if (std::is_same_v<PrecisionT, double>) {
+            _mm512_stream_pd(p, value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void stream_(PrecisionT *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm512_stream_ps(p, value);
+        } else if (std::is_same_v<PrecisionT, double>) {
+            _mm512_stream_pd(p, value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+        store(reinterpret_cast<PrecisionT *>(p), value);
+    }
+
+    PL_FORCE_INLINE
+    static void store(PrecisionT *p, IntrinsicType value) {
+#ifdef PL_LQ_KERNEL_AVX_STREAMING
+        store_(p, value);
+#else
+        stream_(p, value);
+#endif
+    }
+
     PL_FORCE_INLINE
     static auto mul(IntrinsicType v0, IntrinsicType v1) {
         if constexpr (std::is_same_v<PrecisionT, float>) {