From edffa2a180d4219e547d3c22af292ad071f4141f Mon Sep 17 00:00:00 2001
From: Yi-Hong Lyu <yilyu@microsoft.com>
Date: Thu, 25 Apr 2024 08:28:59 -0700
Subject: [PATCH] Optimize MlasComputeSoftmax with prefetch (#20393)

The prefetching instructions (_mm_prefetch) is used to anticipate memory
accesses by prefetching the next row of the input buffer. This
optimization is designed to reduce the impact of memory latency, thereby
enhancing the performance of the MlasComputeSoftmax function. As a
result, the worst-case performance of the OCR model has improved by
approximately 50ms, which equates to a 3% improvement.
---
 onnxruntime/core/mlas/lib/compute.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp
index 78cac2e617ff7..f4c1e3da69289 100644
--- a/onnxruntime/core/mlas/lib/compute.cpp
+++ b/onnxruntime/core/mlas/lib/compute.cpp
@@ -850,8 +850,24 @@ Return Value:
     const float* Input = WorkBlock->Input + n * D;
     float* Output = WorkBlock->Output + n * D;
 
+#if defined(MLAS_SSE2_INTRINSICS)
+    // TODO: Use std::hardware_constructive_interference_size
+    constexpr size_t CacheLineSize = 64;
+    constexpr size_t ElementsPerCacheLine = CacheLineSize / sizeof(float);
+#endif
+
     while (CountN > 0) {
 
+#if defined(MLAS_SSE2_INTRINSICS)
+        //
+        // Prefetch the next row of the input buffer.
+        //
+
+        for (size_t i = 0; i * ElementsPerCacheLine < D; i++) {
+            _mm_prefetch((char*)(Input + D) + i * CacheLineSize, _MM_HINT_T0);
+        }
+#endif
+
         //
         // Find the maximum value for the row.
         //