From edffa2a180d4219e547d3c22af292ad071f4141f Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Thu, 25 Apr 2024 08:28:59 -0700 Subject: [PATCH] Optimize MlasComputeSoftmax with prefetch (#20393) The prefetch instruction (_mm_prefetch) is used to anticipate memory accesses by prefetching the next row of the input buffer. This optimization is designed to reduce the impact of memory latency, thereby enhancing the performance of the MlasComputeSoftmax function. As a result, the worst-case performance of the OCR model has improved by approximately 50ms, which equates to a 3% improvement. --- onnxruntime/core/mlas/lib/compute.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp index 78cac2e617ff7..f4c1e3da69289 100644 --- a/onnxruntime/core/mlas/lib/compute.cpp +++ b/onnxruntime/core/mlas/lib/compute.cpp @@ -850,8 +850,24 @@ Return Value: const float* Input = WorkBlock->Input + n * D; float* Output = WorkBlock->Output + n * D; +#if defined(MLAS_SSE2_INTRINSICS) + // TODO: Use std::hardware_constructive_interference_size + constexpr size_t CacheLineSize = 64; + constexpr size_t ElementsPerCacheLine = CacheLineSize / sizeof(float); +#endif + while (CountN > 0) { +#if defined(MLAS_SSE2_INTRINSICS) + // + // Prefetch the next row of the input buffer. + // + + for (size_t i = 0; i * ElementsPerCacheLine < D; i++) { + _mm_prefetch((char*)(Input + D) + i * CacheLineSize, _MM_HINT_T0); + } +#endif + // // Find the maximum value for the row. //