Skip to content

Commit

Permalink
Enable AVX NE CONVERT for FP16 to FP32 cast
Browse files Browse the repository at this point in the history
* Developed x86 and amd64 assembly kernel using AVX NE CONVERT.
* Developed x86 assembly kernel using SSE instructions.
* Added fallback implementation for FP16 to FP32 cast.
* Runtime check to determine if CPU supports the ISA required for the kernel.
* Added kernel dispatching logic on platform.cpp
  • Loading branch information
eralmual committed Sep 4, 2024
1 parent cbf3c50 commit a6b0a7e
Show file tree
Hide file tree
Showing 12 changed files with 542 additions and 12 deletions.
19 changes: 19 additions & 0 deletions cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/sqnbitgemm.cpp
${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
${MLAS_SRC_DIR}/flashattn.cpp
${MLAS_SRC_DIR}/cast.cpp
)

target_sources(onnxruntime_mlas PRIVATE
Expand Down Expand Up @@ -212,6 +213,12 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/amd64/TanhKernelFma3.asm
${MLAS_SRC_DIR}/amd64/ErfKernelFma3.asm
)
if(MSVC_VERSION GREATER_EQUAL 1933)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/amd64/cvtfp16Avx.asm
)
endif()

if (NOT onnxruntime_ORT_MINIMAL_BUILD)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/q4gemm_avx512.cpp
Expand Down Expand Up @@ -522,6 +529,12 @@ else()
${MLAS_SRC_DIR}/x86_64/SconvKernelSse2.S
${MLAS_SRC_DIR}/x86_64/SpoolKernelSse2.S
)
if(NOT APPLE)
set(mlas_platform_srcs_sse2
${mlas_platform_srcs_sse2}
${MLAS_SRC_DIR}/x86_64/cvtfp16a.S
)
endif()
set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")

set(mlas_platform_srcs_avx
Expand Down Expand Up @@ -555,6 +568,12 @@ else()
${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
)
if(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 13.1 AND NOT(APPLE))
set(mlas_platform_srcs_avx2
${mlas_platform_srcs_avx2}
${MLAS_SRC_DIR}/x86_64/cvtfp16Avx.S
)
endif()

message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
message(STATUS "CMAKE_CXX_COMPILER_VERSION: ${CMAKE_CXX_COMPILER_VERSION}")
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/common/cpuid_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ void CPUIDInfo::X86Init() {
if (max_SubLeaves >= 1) {
GetCPUID(7, 1, data);
has_avx512_bf16_ = has_avx512 && (data[0] & (1 << 5));
// Check for AVX_NE_CONVERT as half precision kernel uses it with AVX2 and F16C
has_avx_ne_convert_ = has_avx2_ && (data[3] & (1 << 5));
}
}
}
Expand Down
4 changes: 3 additions & 1 deletion onnxruntime/core/common/cpuid_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ class CPUIDInfo {
bool HasAVX512f() const { return has_avx512f_; }
bool HasAVX512_BF16() const { return has_avx512_bf16_; }
bool HasAVX512Skylake() const { return has_avx512_skylake_; }
bool HasF16C() const { return has_f16c_; } /*fp16 conversion inst*/
bool HasF16C() const { return has_f16c_; } /*fp16 conversion inst*/
bool HasAVX_NE_CONVERT() const { return has_avx_ne_convert_; } /*fp16/bf16 conversion inst*/
bool HasSSE3() const { return has_sse3_; }
bool HasSSE4_1() const { return has_sse4_1_; }
bool IsHybrid() const { return is_hybrid_; }
Expand Down Expand Up @@ -101,6 +102,7 @@ class CPUIDInfo {
bool has_avx512_bf16_{false};
bool has_avx512_skylake_{false};
bool has_f16c_{false};
bool has_avx_ne_convert_{false};
bool has_sse3_{false};
bool has_sse4_1_{false};
bool is_hybrid_{false};
Expand Down
3 changes: 1 addition & 2 deletions onnxruntime/core/mlas/inc/mlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -1029,14 +1029,13 @@ MlasComputeTanh(
// Half-precision floating-point routines.
//

extern "C"
void
MLASCALL
MlasConvertHalfToFloatBuffer(
const unsigned short* Source,
float* Destination,
size_t Count
);
);

//
// Transpose routines.
Expand Down
151 changes: 151 additions & 0 deletions onnxruntime/core/mlas/lib/amd64/cvtfp16Avx.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
;++
;
; Copyright (c) Intel Corporation. All rights reserved.
;
; Licensed under the MIT License.
;
; Module Name:
;
;   cvtfp16Avx.asm
;
; Abstract:
;
; This module implements routines to convert between FP16 and FP32 formats using the AVX_NE_CONVERT ISA.
;
;--

.xlist
INCLUDE mlasi.inc
.list

.const

SINGLE_SIZE equ 4
HALF_SIZE equ 2
LOW_SELECTOR equ 00100000b
HIGH_SELECTOR equ 00110001b

SUBTTL "Convert buffer of half-precision floats to single-precision floats"
;++
;
; Routine Description:
;
; This routine converts the source buffer of half-precision floats to the
; destination buffer of single-precision floats.
;
;   This implementation uses AVX NE CONVERT (with AVX/AVX2 shuffle and
;   masked-store instructions) to perform the conversion.
;
; Arguments:
;
; Source (rcx) - Supplies the address of the source buffer of half-precision
; floats.
;
; Destination (rdx) - Supplies the address of the destination buffer of
; single-precision floats.
;
; Count (r8) - Supplies the number of elements to convert.
;
; Return Value:
;
; None.
;
;--


LEAF_ENTRY MlasCastF16ToF32KernelAvx, _TEXT

; Dispatch on the remaining element count:
;   Count >= 16 -> Convert256Vectors (16 elements per pass, ymm stores)
;   8 <= Count < 16 -> Convert128Vectors (8 elements, xmm stores)
;   0 < Count < 8 -> ConvertMaskedVectors (masked xmm stores)
test r8, r8 ; Check if we have any elements to convert
jz ExitRoutine
cmp r8, 8
jb ConvertMaskedVectors
cmp r8, 16
jb Convert128Vectors



; Main loop: VCVTNEEPH2PS/VCVTNEOPH2PS convert the even/odd-indexed fp16
; lanes separately, so the results must be re-interleaved (vunpck*ps) and
; the 128-bit halves re-ordered (vperm2f128) to restore source order.
Convert256Vectors:
vcvtneeph2ps ymm0, ymmword PTR [rcx] ; Load even indexes
vcvtneoph2ps ymm1, ymmword PTR [rcx] ; Load odd indexes
vunpcklps ymm2, ymm0, ymm1 ; Interleave low part
vunpckhps ymm1, ymm0, ymm1 ; Interleave high part
vperm2f128 ymm0, ymm2, ymm1, LOW_SELECTOR ; Fix the order
vperm2f128 ymm1, ymm2, ymm1, HIGH_SELECTOR ; Fix the order
vmovups ymmword PTR [rdx], ymm0 ; Store the low part
vmovups ymmword PTR [rdx + 8*SINGLE_SIZE], ymm1 ; Store the high part

add rcx, 16*HALF_SIZE ; Advance src ptr by 16 elements
add rdx, 16*SINGLE_SIZE ; Advance dest ptr by 16 elements
sub r8, 16 ; Reduce the counter by 16 elements

jz ExitRoutine ; If we are done, exit
cmp r8, 16 ; If the vector is big enough, we go again
jae Convert256Vectors



; 8-element tail: same even/odd split as above, but the xmm forms only
; need the unpacks to restore source order (no cross-lane fixup).
Convert128Vectors:
vcvtneeph2ps xmm2, xmmword PTR [rcx] ; Load even indexes
vcvtneoph2ps xmm1, xmmword PTR [rcx] ; Load odd indexes
vunpcklps xmm0, xmm2, xmm1 ; Interleave low part to fix order
vunpckhps xmm1, xmm2, xmm1 ; Interleave high part to fix order
vmovups xmmword PTR [rdx], xmm0 ; Store the low part
vmovups xmmword PTR [rdx + 4*SINGLE_SIZE], xmm1 ; Store the high part

add rcx, 8*HALF_SIZE ; Advance src ptr by 8 elements
add rdx, 8*SINGLE_SIZE ; Advance dest ptr by 8 elements
sub r8, 8 ; Reduce the counter by 8 elements

jz ExitRoutine ; If we are done, exit



; Sub-8 tail (r8 is 1..7 here): converts are done full-width, then
; vmaskmovps limits the STORES to the remaining element count.
; NOTE(review): the two loads below always read a full 16 bytes from
; [rcx] even though fewer than 8 elements remain, so up to 14 bytes past
; Source + Count may be read; confirm callers tolerate this over-read or
; that the buffer is suitably padded.
ConvertMaskedVectors:
vcvtneeph2ps xmm2, xmmword PTR [rcx] ; Load even indexes
vcvtneoph2ps xmm1, xmmword PTR [rcx] ; Load odd indexes
vunpcklps xmm0, xmm2, xmm1 ; Interleave low part to fix order
vunpckhps xmm1, xmm2, xmm1 ; Interleave high part to fix order

cmp r8, 4 ; Check if we can store the complete lower vector
jae ConvertLowerVector

; Build a store mask covering r8 (1..3) dwords: start from all-ones and
; shift right by (4 - r8) dword widths so only the low lanes are stored.
vpcmpeqw xmm2, xmm2, xmm2 ; Initialize the mask full of ones
cmp r8, 2 ; Check how many converts we need
jb ConvertLower1
ja ConvertLower3
vpsrldq xmm2, xmm2, SINGLE_SIZE*2 ; Shift the memory store two values
jmp ConvertLowerMaskedVector
ConvertLower1:
vpsrldq xmm2, xmm2, SINGLE_SIZE*3 ; Shift the memory store only one value
jmp ConvertLowerMaskedVector
ConvertLower3:
vpsrldq xmm2, xmm2, SINGLE_SIZE ; Shift the memory store three values
ConvertLowerMaskedVector:
vmaskmovps xmmword PTR [rdx], xmm2, xmm0 ; Store the masked data, the shift is done in 8bit multiples
jmp ExitRoutine ; If we ran into any of the cases above, means we are done after storing
ConvertLowerVector:
vmovups xmmword PTR [rdx], xmm0 ; Store the low part
sub r8, 4 ; Check if we still need to convert
jz ExitRoutine


; r8 is now 1..3: mask-store the upper interleaved half (xmm1) the same
; way the lower half was handled above.
add rdx, 4*SINGLE_SIZE ; Advance dest ptr by 4 elements
vpcmpeqw xmm2, xmm2, xmm2 ; Initialize the mask full of ones
cmp r8, 2 ; Check how many converts we need
jb ConvertUpper1
ja ConvertUpper3
vpsrldq xmm2, xmm2, SINGLE_SIZE*2 ; Shift the memory store two values
jmp ConvertMaskedUpperVector
ConvertUpper1:
vpsrldq xmm2, xmm2, SINGLE_SIZE*3 ; Shift the memory store only one value
jmp ConvertMaskedUpperVector
ConvertUpper3:
vpsrldq xmm2, xmm2, SINGLE_SIZE ; Shift the memory store three values
ConvertMaskedUpperVector:
vmaskmovps xmmword PTR [rdx], xmm2, xmm1 ; Store the masked data, the shift is done in 8bit multiples

ExitRoutine:
; NOTE(review): ymm registers are written in Convert256Vectors but there
; is no vzeroupper before ret — confirm whether the AVX-SSE transition
; penalty is acceptable here or handled by the caller.
ret

LEAF_END MlasCastF16ToF32KernelAvx, _TEXT

END
6 changes: 3 additions & 3 deletions onnxruntime/core/mlas/lib/amd64/cvtfp16a.asm
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ MlasFp16MagicDenormal DD 4 DUP (38800000h)
; Source (rcx) - Supplies the address of the source buffer of half-precision
; floats.
;
; Destination (edx) - Supplies the address of the destination buffer of
; Destination (rdx) - Supplies the address of the destination buffer of
; single-precision floats.
;
; Count (r8) - Supplies the number of elements to convert.
Expand All @@ -53,7 +53,7 @@ MlasFp16MagicDenormal DD 4 DUP (38800000h)
;
;--

LEAF_ENTRY MlasConvertHalfToFloatBuffer, _TEXT
LEAF_ENTRY MlasCastF16ToF32KernelSse, _TEXT

test r8,r8
jz ExitRoutine
Expand Down Expand Up @@ -119,6 +119,6 @@ StoreLastElement:
ExitRoutine:
ret

LEAF_END MlasConvertHalfToFloatBuffer, _TEXT
LEAF_END MlasCastF16ToF32KernelSse, _TEXT

END
59 changes: 59 additions & 0 deletions onnxruntime/core/mlas/lib/cast.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*++
Copyright (c) Intel Corporation. All rights reserved.
Licensed under the MIT License.
Module Name:
cast.cpp
Abstract:
This module implements Half (F16) to Single (F32) precision casting.
--*/
#include "mlasi.h"

// Bit-level view of an IEEE-754 binary32 value, used for float <-> integer
// punning below. Reading the member other than the one last written is a
// compiler extension (supported by MSVC/GCC/Clang), consistent with the
// rest of MLAS.
union fp32_bits {
    uint32_t u;
    float f;
};

//
// Routine Description:
//
//    Converts a buffer of half-precision floats to single-precision floats.
//
//    Dispatches to the platform-specific kernel selected at startup when one
//    is available; otherwise uses a portable scalar fallback adapted from
//    mlas_float16.h.
//
// Arguments:
//
//    Source - Supplies the address of the source buffer of half-precision
//        floats (raw IEEE-754 binary16 bit patterns).
//
//    Destination - Supplies the address of the destination buffer of
//        single-precision floats.
//
//    Count - Supplies the number of elements to convert.
//
// Return Value:
//
//    None.
//
void
MLASCALL
MlasConvertHalfToFloatBuffer(
    const unsigned short* Source,
    float* Destination,
    size_t Count
)
{
    if (GetMlasPlatform().CastF16ToF32Kernel != nullptr) {
        // A hardware-accelerated kernel (SSE or AVX_NE_CONVERT) was detected
        // at startup; use it for the whole buffer.
        GetMlasPlatform().CastF16ToF32Kernel(Source, Destination, Count);
        return;
    }

    // Scalar reference implementation.
    constexpr fp32_bits magic = {113 << 23};        // 2^-14 as fp32: smallest fp16 normal
    constexpr uint32_t shifted_exp = 0x7c00 << 13;  // fp16 exponent mask after shift

    for (size_t i = 0; i < Count; ++i) {
        // Work in uint32_t so the sign-bit shift below stays in unsigned
        // arithmetic. The previous code computed (Source[i] & 0x8000) << 16
        // on a promoted signed int, shifting a bit into the sign position
        // (undefined/implementation-defined behavior before C++20).
        const uint32_t bits = Source[i];

        fp32_bits o;
        o.u = (bits & 0x7fff) << 13;            // move exponent/mantissa into fp32 position
        const uint32_t exp = shifted_exp & o.u; // isolate the (shifted) exponent
        o.u += (127 - 15) << 23;                // rebias exponent: fp16 bias 15 -> fp32 bias 127

        // Handle exponent special cases.
        if (exp == shifted_exp) {   // Inf/NaN?
            o.u += (128 - 16) << 23;  // extra exponent adjust to fp32 Inf/NaN
        } else if (exp == 0) {      // Zero/Denormal?
            o.u += 1 << 23;           // extra exponent adjust
            o.f -= magic.f;           // renormalize the subnormal value
        }

        o.u |= (bits & 0x8000) << 16;  // replicate the sign bit
        Destination[i] = o.f;
    }
}
14 changes: 14 additions & 0 deletions onnxruntime/core/mlas/lib/mlasi.h
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,13 @@ void
size_t N
);

typedef
void(MLASCALL MLAS_CAST_F16_TO_F32_KERNEL)(
const unsigned short* Source,
float* Destination,
size_t Count
);

typedef
void
(MLASCALL MLAS_QLINEAR_BINARY_OP_S8_KERNEL)(
Expand Down Expand Up @@ -870,6 +877,11 @@ extern "C" {
MLAS_REDUCE_MINIMUM_MAXIMUM_FLOAT_KERNEL MlasReduceMinimumMaximumF32KernelAvx;
#endif

#if defined(MLAS_TARGET_AMD64)
MLAS_CAST_F16_TO_F32_KERNEL MlasCastF16ToF32KernelSse;
MLAS_CAST_F16_TO_F32_KERNEL MlasCastF16ToF32KernelAvx;
#endif

}

//
Expand Down Expand Up @@ -1151,6 +1163,8 @@ struct MLAS_PLATFORM {
const MLAS_Q8Q4GEMM_DISPATCH* Q8Q4GemmDispatch{nullptr};

const MLAS_SQNBIT_GEMM_DISPATCH* SQNBitGemmDispatch{nullptr};

MLAS_CAST_F16_TO_F32_KERNEL* CastF16ToF32Kernel;
};

inline
Expand Down
14 changes: 14 additions & 0 deletions onnxruntime/core/mlas/lib/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ Return Value:
this->ConvDepthwiseU8U8Kernel = MlasConvDepthwiseKernel<uint8_t, uint8_t>;
this->ConvDepthwiseS8S8Kernel = MlasConvDepthwiseKernel<int8_t, int8_t>;
this->ConvDepthwiseS8U8Kernel = MlasConvDepthwiseKernel<int8_t, uint8_t>;
this->CastF16ToF32Kernel = nullptr;

#if defined(MLAS_TARGET_AMD64_IX86)

Expand Down Expand Up @@ -283,6 +284,9 @@ Return Value:
this->QuantizeLinearU16Kernel = MlasQuantizeLinearU16Kernel;
this->QuantizeLinearS4Kernel = MlasQuantizeLinearS4Kernel;
this->QuantizeLinearU4Kernel = MlasQuantizeLinearU4Kernel;
#ifndef __APPLE__
this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelSse;
#endif // __APPLE__

this->NchwcBlockSize = 8;
this->PreferredBufferAlignment = MLAS_DEFAULT_PREFERRED_BUFFER_ALIGNMENT;
Expand Down Expand Up @@ -469,6 +473,16 @@ Return Value:
}

#ifndef __APPLE__
#if (defined(_MSC_VER) && (_MSC_VER >= 1933)) || (defined(__GNUC__) && (__GNUC__ >= 13))
//
// Check if the processor supports AVX NE CONVERT.
//
if ((Cpuid7_1[3] & (0b1 << 5)) != 0) {
this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelAvx;
}
#endif // (defined(_MSC_VER) && (_MSC_VER >= 1933)) || (defined(__GNUC__) && (__GNUC__ >= 13))


//
// Check if the processor supports AMX-TILE and AMX-INT8
// features.
Expand Down
Loading

0 comments on commit a6b0a7e

Please sign in to comment.