diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp
index db3b9ee656592..ec5cdbc75220a 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp
@@ -155,7 +155,7 @@ namespace
 template <bool HasZeroPoint>
 MLAS_FORCEINLINE
 void
-SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
+SQ4BitGemm_CompInt8_Compute4x2_BlkLen16(
     const std::byte* QuantARowPtr,
     const std::byte* QuantBDataColPtr,
     const float* QuantBScaleColPtr,
@@ -177,11 +177,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
     const float* QuantBScalePtr = QuantBScaleColPtr;
     const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr;

-    float32x4_t acc00{}, acc01{}, acc10{}, acc11{};
+    float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{};

     for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) {
         const std::byte* QuantABlkRow0 = QuantAPtr;
         const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA;
+        const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2;
+        const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3;

         const float QuantBScaleCol0 = *QuantBScalePtr;
         const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale);
@@ -191,6 +193,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
         const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1;
         const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0;
         const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1;
+        const float scale20 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0;
+        const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1;
+        const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0;
+        const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1;

         // load B zero point
         int8_t bzp_col0;
@@ -212,13 +218,11 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(

         const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0);
         const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1);
+        const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2);
+        const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3);

         // TODO handling only 16 elements per accumulator at a time here, probably can do better
         {
-            // load A
-            const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0);
-            const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0);
-
             // load B
             const uint8x8_t bv_packed_col0 = vld1_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
             const uint8x8_t bv_packed_col1 = vld1_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr) + StrideQuantBData);
@@ -242,24 +246,55 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
             bv_col0 = vsubq_s8(bv_col0, vdupq_n_s8(bzp_col0));
             bv_col1 = vsubq_s8(bv_col1, vdupq_n_s8(bzp_col1));

-            // quantized dot product
-            int32x4_t dot00{}, dot01{}, dot10{}, dot11{};
-            dot00 = vdotq_s32(dot00, av_row0, bv_col0);
-            dot01 = vdotq_s32(dot01, av_row0, bv_col1);
-            dot10 = vdotq_s32(dot10, av_row1, bv_col0);
-            dot11 = vdotq_s32(dot11, av_row1, bv_col1);
-
-            // convert to float
-            const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00);
-            const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01);
-            const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10);
-            const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11);
+            // rows 0 and 1 of A
+            {
+                // load A
+                const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0);
+                const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0);
+
+                // quantized dot product
+                const int32x4_t dot00 = vdotq_s32(int32x4_t{}, av_row0, bv_col0);
+                const int32x4_t dot01 = vdotq_s32(int32x4_t{}, av_row0, bv_col1);
+                const int32x4_t dot10 = vdotq_s32(int32x4_t{}, av_row1, bv_col0);
+                const int32x4_t dot11 = vdotq_s32(int32x4_t{}, av_row1, bv_col1);
+
+                // convert to float
+                const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00);
+                const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01);
+                const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10);
+                const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11);
+
+                // multiply by scale and update accumulator
+                acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00));
+                acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01));
+                acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10));
+                acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11));
+            }

-            // multiply by scale and update accumulator
-            acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00));
-            acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01));
-            acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10));
-            acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11));
+            // rows 2 and 3 of A
+            {
+                // load A
+                const int8x16_t av_row2 = vld1q_s8(QuantADataPtrRow2 + 0);
+                const int8x16_t av_row3 = vld1q_s8(QuantADataPtrRow3 + 0);
+
+                // quantized dot product
+                const int32x4_t dot20 = vdotq_s32(int32x4_t{}, av_row2, bv_col0);
+                const int32x4_t dot21 = vdotq_s32(int32x4_t{}, av_row2, bv_col1);
+                const int32x4_t dot30 = vdotq_s32(int32x4_t{}, av_row3, bv_col0);
+                const int32x4_t dot31 = vdotq_s32(int32x4_t{}, av_row3, bv_col1);
+
+                // convert to float
+                const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20);
+                const float32x4_t dot_f32_21 = vcvtq_f32_s32(dot21);
+                const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30);
+                const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31);
+
+                // multiply by scale and update accumulator
+                acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20));
+                acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21));
+                acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30));
+                acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31));
+            }
         }

         // increment block pointers
@@ -273,22 +308,30 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
         }
     }

-    SumPtr[0] = vaddvq_f32(acc00);
-    SumPtr[1] = vaddvq_f32(acc01);
-    SumPtr[ldc + 0] = vaddvq_f32(acc10);
-    SumPtr[ldc + 1] = vaddvq_f32(acc11);
+    SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00);
+    SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01);
+    SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10);
+    SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11);
+    SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20);
+    SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21);
+    SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30);
+    SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31);

     if (BiasPtr != nullptr) {
-        SumPtr[0] += BiasPtr[0];
-        SumPtr[1] += BiasPtr[1];
-        SumPtr[ldc + 0] += BiasPtr[0];
-        SumPtr[ldc + 1] += BiasPtr[1];
+        SumPtr[ldc * 0 + 0] += BiasPtr[0];
+        SumPtr[ldc * 0 + 1] += BiasPtr[1];
+        SumPtr[ldc * 1 + 0] += BiasPtr[0];
+        SumPtr[ldc * 1 + 1] += BiasPtr[1];
+        SumPtr[ldc * 2 + 0] += BiasPtr[0];
+        SumPtr[ldc * 2 + 1] += BiasPtr[1];
+        SumPtr[ldc * 3 + 0] += BiasPtr[0];
+        SumPtr[ldc * 3 + 1] += BiasPtr[1];
     }
 }
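For readers skimming the kernel change: each 4x2 tile now keeps eight `float32x4_t` accumulators, one per (row, column) pair, and issues one `vdotq_s32` per pair. The sketch below distills that pattern outside the MLAS kernel. It is a minimal sketch, not the kernel itself: `Accumulate4x2Tile` and its arguments are hypothetical names, the B data is assumed already unpacked and zero-point adjusted, and a dotprod-capable AArch64 target is assumed (e.g. `-march=armv8.2-a+dotprod`).

```cpp
#include <arm_neon.h>

// Hypothetical helper: one 16-element int8 block per A row, two prepared B
// columns, and scales[r][c] = a_scale[r] * b_scale[c] for dequantization.
void Accumulate4x2Tile(const int8_t* a_rows[4], const int8x16_t b_cols[2],
                       const float scales[4][2], float32x4_t acc[4][2]) {
    for (int r = 0; r < 4; ++r) {
        const int8x16_t av = vld1q_s8(a_rows[r]);
        for (int c = 0; c < 2; ++c) {
            // 4-way int8 dot product into four int32 lanes, seeded with zero
            const int32x4_t dot = vdotq_s32(int32x4_t{}, av, b_cols[c]);
            // dequantize and accumulate
            acc[r][c] = vfmaq_f32(acc[r][c], vcvtq_f32_s32(dot),
                                  vdupq_n_f32(scales[r][c]));
        }
    }
    // after all K blocks, callers reduce each accumulator with vaddvq_f32
}
```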
 template <bool HasZeroPoint>
 MLAS_FORCEINLINE
 void
-SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
+SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16(
     size_t BlkLen,
     const std::byte* QuantARowPtr,
     const std::byte* QuantBDataColPtr,
@@ -312,11 +355,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
     const float* QuantBScalePtr = QuantBScaleColPtr;
     const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr;

-    float32x4_t acc00{}, acc01{}, acc10{}, acc11{};
+    float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{};

     for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) {
         const std::byte* QuantABlkRow0 = QuantAPtr;
         const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA;
+        const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2;
+        const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3;

         const float QuantBScaleCol0 = *QuantBScalePtr;
         const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale);
@@ -326,6 +371,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
         const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1;
         const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0;
         const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1;
+        const float scale20 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0;
+        const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1;
+        const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0;
+        const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1;

         // load B zero point
         int8_t bzp_col0;
@@ -347,14 +396,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(

         const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0);
         const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1);
+        const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2);
+        const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3);

         for (size_t sub_blk_idx = 0; sub_blk_idx < SubBlksPerBlk; ++sub_blk_idx) {
-            // load A
-            const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0);
-            const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16);
-            const int8x16_t av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0);
-            const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16);
-
             // load B
             const uint8x16_t bv_packed_col0 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
             const uint8x16_t bv_packed_col1 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr) + StrideQuantBData);
@@ -372,28 +417,65 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
             bv_col1_0 = vsubq_s8(bv_col1_0, vdupq_n_s8(bzp_col1));
             bv_col1_1 = vsubq_s8(bv_col1_1, vdupq_n_s8(bzp_col1));

-            // quantized dot product
-            int32x4_t dot00{}, dot01{}, dot10{}, dot11{};
-            dot00 = vdotq_s32(vdotq_s32(dot00, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1);
-            dot01 = vdotq_s32(vdotq_s32(dot01, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1);
-            dot10 = vdotq_s32(vdotq_s32(dot10, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1);
-            dot11 = vdotq_s32(vdotq_s32(dot11, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1);
-
-            // convert to float
-            const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00);
-            const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01);
-            const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10);
-            const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11);
+            // rows 0 and 1 of A
+            {
+                // load A
+                const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0);
+                const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16);
+                const int8x16_t av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0);
+                const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16);
+
+                // quantized dot product
+                const int32x4_t dot00 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1);
+                const int32x4_t dot01 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1);
+                const int32x4_t dot10 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1);
+                const int32x4_t dot11 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1);
+
+                // convert to float
+                const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00);
+                const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01);
+                const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10);
+                const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11);
+
+                // multiply by scale and update accumulator
+                acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00));
+                acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01));
+                acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10));
+                acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11));
+            }

-            // multiply by scale and update accumulator
-            acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00));
-            acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01));
-            acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10));
-            acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11));
+            // rows 2 and 3 of A
+            {
+                // load A
+                const int8x16_t av_row2_0 = vld1q_s8(QuantADataPtrRow2 + 0);
+                const int8x16_t av_row2_1 = vld1q_s8(QuantADataPtrRow2 + 16);
+                const int8x16_t av_row3_0 = vld1q_s8(QuantADataPtrRow3 + 0);
+                const int8x16_t av_row3_1 = vld1q_s8(QuantADataPtrRow3 + 16);
+
+                // quantized dot product
+                const int32x4_t dot20 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col0_0), av_row2_1, bv_col0_1);
+                const int32x4_t dot21 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col1_0), av_row2_1, bv_col1_1);
+                const int32x4_t dot30 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col0_0), av_row3_1, bv_col0_1);
+                const int32x4_t dot31 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col1_0), av_row3_1, bv_col1_1);
+
+                // convert to float
+                const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20);
+                const float32x4_t dot_f32_21 = vcvtq_f32_s32(dot21);
+                const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30);
+                const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31);
+
+                // multiply by scale and update accumulator
+                acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20));
+                acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21));
+                acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30));
+                acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31));
+            }

             // increment block data pointers to next sub-block
             QuantADataPtrRow0 += 32;
             QuantADataPtrRow1 += 32;
+            QuantADataPtrRow2 += 32;
+            QuantADataPtrRow3 += 32;
             QuantBDataPtr += 16;
         }

@@ -407,16 +489,24 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
         }
     }

-    SumPtr[0] = vaddvq_f32(acc00);
-    SumPtr[1] = vaddvq_f32(acc01);
-    SumPtr[ldc + 0] = vaddvq_f32(acc10);
-    SumPtr[ldc + 1] = vaddvq_f32(acc11);
+    SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00);
+    SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01);
+    SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10);
+    SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11);
+    SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20);
+    SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21);
+    SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30);
+    SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31);

     if (BiasPtr != nullptr) {
-        SumPtr[0] += BiasPtr[0];
-        SumPtr[1] += BiasPtr[1];
-        SumPtr[ldc + 0] += BiasPtr[0];
-        SumPtr[ldc + 1] += BiasPtr[1];
+        SumPtr[ldc * 0 + 0] += BiasPtr[0];
+        SumPtr[ldc * 0 + 1] += BiasPtr[1];
+        SumPtr[ldc * 1 + 0] += BiasPtr[0];
+        SumPtr[ldc * 1 + 1] += BiasPtr[1];
+        SumPtr[ldc * 2 + 0] += BiasPtr[0];
+        SumPtr[ldc * 2 + 1] += BiasPtr[1];
+        SumPtr[ldc * 3 + 0] += BiasPtr[0];
+        SumPtr[ldc * 3 + 1] += BiasPtr[1];
     }
 }
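Note the seeding idiom used throughout the rewritten dot products: a value-initialized `int32x4_t{}` replaces the named zero accumulator (or `vdupq_n_s32(0)`), which lets each `dot*` become a single `const` expression. A minimal sketch of the equivalence, with `av`/`bv` standing for any int8 vectors:

```cpp
#include <arm_neon.h>

// Both seeds below are all-zero vectors, so dot_a == dot_b lane for lane.
int32x4_t seed_equivalence(int8x16_t av, int8x16_t bv) {
    const int32x4_t dot_a = vdotq_s32(vdupq_n_s32(0), av, bv);  // old spelling
    const int32x4_t dot_b = vdotq_s32(int32x4_t{}, av, bv);     // new spelling
    return vsubq_s32(dot_a, dot_b);  // every lane is 0
}
```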
@@ -478,8 +568,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16(
     bv1 = vsubq_s8(bv1, bzp1);

     // quantized dot product
-    const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0);
-    const int32x4_t dot1 = vdotq_s32(vdupq_n_s32(0), av1, bv1);
+    const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0);
+    const int32x4_t dot1 = vdotq_s32(int32x4_t{}, av1, bv1);

     // convert to float
     const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -527,7 +617,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16(
     bv0 = vsubq_s8(bv0, bzp0);

     // quantized dot product
-    const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0);
+    const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0);

     // convert to float
     const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -604,9 +694,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32(
     bv_hi1 = vsubq_s8(bv_hi1, bzp1);

     // quantized dot product
-    int32x4_t dot0{}, dot1{};
-    dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0);
-    dot1 = vdotq_s32(vdotq_s32(dot1, av_lo1, bv_lo1), av_hi1, bv_hi1);
+    const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0);
+    const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo1, bv_lo1), av_hi1, bv_hi1);

     // convert to float
     const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -652,8 +741,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32(
     bv_hi0 = vsubq_s8(bv_hi0, bzp0);

     // quantized dot product
-    int32x4_t dot0{};
-    dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0);
+    const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0);

     // convert to float
     const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -736,9 +824,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32(
     bv3 = vsubq_s8(bv3, bzp);

     // quantized dot product
-    int32x4_t dot0{}, dot1{};
-    dot0 = vdotq_s32(vdotq_s32(dot0, av0, bv0), av1, bv1);
-    dot1 = vdotq_s32(vdotq_s32(dot1, av2, bv2), av3, bv3);
+    const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av0, bv0), av1, bv1);
+    const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av2, bv2), av3, bv3);

     // convert to float
     const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -834,7 +921,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen16(
     float* SumRowPtr = C;

     size_t m_remaining = CountM;
-    while (m_remaining > 1) {
+    while (m_remaining > 3) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -845,8 +932,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen16(

         size_t n_remaining = CountN;
         while (n_remaining > 1) {
-            // Compute 2x2 tiles of output
-            SQ4BitGemm_CompInt8_Compute2x2_BlkLen16<HasZeroPoint>(
+            // Compute 4x2 tiles of output
+            SQ4BitGemm_CompInt8_Compute4x2_BlkLen16<HasZeroPoint>(
                 QuantARowPtr,
                 QuantBDataColPtr,
                 QuantBScaleColPtr,
@@ -871,38 +958,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen16(
         }

         if (n_remaining > 0) {
-            // Compute last 2x1 tile of output
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLen16<HasZeroPoint>(
-                QuantARowPtr,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr,
-                BlockCountK
-            );
-
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLen16<HasZeroPoint>(
-                QuantARowPtr + StrideQuantA,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr + ldc,
-                BlockCountK
-            );
+            // Compute last 4x1 tile of output
+            for (size_t i = 0; i < 4; ++i) {
+                SQ4BitGemm_CompInt8_Compute1x1_BlkLen16<HasZeroPoint>(
+                    QuantARowPtr + StrideQuantA * i,
+                    QuantBDataColPtr,
+                    QuantBScaleColPtr,
+                    QuantBZeroPointColPtr,
+                    BiasPtr,
+                    SumPtr + ldc * i,
+                    BlockCountK
+                );
+            }
         }

-        // Move to next 2 rows
-        AdvanceRowPtrs<2>(
+        // Move to next 4 rows
+        AdvanceRowPtrs<4>(
             StrideQuantA, ldc,
             QuantARowPtr, SumRowPtr
         );

-        m_remaining -= 2;
+        m_remaining -= 4;
     }

-    if (m_remaining > 0) {
+    while (m_remaining > 0) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -932,6 +1011,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen16(
             n_remaining -= 1;
         }
+
+        // Move to next row
+        AdvanceRowPtrs<1>(
+            StrideQuantA, ldc,
+            QuantARowPtr, SumRowPtr
+        );
+
+        m_remaining -= 1;
     }
 }

@@ -964,7 +1051,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen32(
     float* SumRowPtr = C;

     size_t m_remaining = CountM;
-    while (m_remaining > 1) {
+    while (m_remaining > 3) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -975,8 +1062,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen32(

         size_t n_remaining = CountN;
         while (n_remaining > 1) {
-            // Compute 2x2 tiles of output
-            SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16<HasZeroPoint>(
+            // Compute 4x2 tiles of output
+            SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16<HasZeroPoint>(
                 BlkLen,
                 QuantARowPtr,
                 QuantBDataColPtr,
@@ -1002,38 +1089,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen32(
         }

         if (n_remaining > 0) {
-            // Compute last 2x1 tile of output
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLen32<HasZeroPoint>(
-                QuantARowPtr,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr,
-                BlockCountK
-            );
-
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLen32<HasZeroPoint>(
-                QuantARowPtr + StrideQuantA,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr + ldc,
-                BlockCountK
-            );
+            // Compute last 4x1 tile of output
+            for (size_t i = 0; i < 4; ++i) {
+                SQ4BitGemm_CompInt8_Compute1x1_BlkLen32<HasZeroPoint>(
+                    QuantARowPtr + StrideQuantA * i,
+                    QuantBDataColPtr,
+                    QuantBScaleColPtr,
+                    QuantBZeroPointColPtr,
+                    BiasPtr,
+                    SumPtr + ldc * i,
+                    BlockCountK
+                );
+            }
         }

-        // Move to next 2 rows
-        AdvanceRowPtrs<2>(
+        // Move to next 4 rows
+        AdvanceRowPtrs<4>(
             StrideQuantA, ldc,
             QuantARowPtr, SumRowPtr
         );

-        m_remaining -= 2;
+        m_remaining -= 4;
     }

-    if (m_remaining > 0) {
+    while (m_remaining > 0) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -1063,6 +1142,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen32(
             n_remaining -= 1;
         }
+
+        // Move to next row
+        AdvanceRowPtrs<1>(
+            StrideQuantA, ldc,
+            QuantARowPtr, SumRowPtr
+        );
+
+        m_remaining -= 1;
     }
 }

@@ -1095,7 +1182,7 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32(
     float* SumRowPtr = C;

     size_t m_remaining = CountM;
-    while (m_remaining > 1) {
+    while (m_remaining > 3) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -1106,8 +1193,8 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32(

         size_t n_remaining = CountN;
         while (n_remaining > 1) {
-            // Compute 2x2 tiles of output
-            SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16<HasZeroPoint>(
+            // Compute 4x2 tiles of output
+            SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16<HasZeroPoint>(
                 BlkLen,
                 QuantARowPtr,
                 QuantBDataColPtr,
@@ -1133,40 +1220,31 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32(
         }

         if (n_remaining > 0) {
-            // Compute last 2x1 tile of output
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32<HasZeroPoint>(
-                BlkLen,
-                QuantARowPtr,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr,
-                BlockCountK
-            );
-
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32<HasZeroPoint>(
-                BlkLen,
-                QuantARowPtr + StrideQuantA,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr + ldc,
-                BlockCountK
-            );
+            // Compute last 4x1 tile of output
+            for (size_t i = 0; i < 4; ++i) {
+                SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32<HasZeroPoint>(
+                    BlkLen,
+                    QuantARowPtr + StrideQuantA * i,
+                    QuantBDataColPtr,
+                    QuantBScaleColPtr,
+                    QuantBZeroPointColPtr,
+                    BiasPtr,
+                    SumPtr + ldc * i,
+                    BlockCountK
+                );
+            }
         }

-        // Move to next 2 rows
-        AdvanceRowPtrs<2>(
+        // Move to next 4 rows
+        AdvanceRowPtrs<4>(
             StrideQuantA, ldc,
             QuantARowPtr, SumRowPtr
         );

-        m_remaining -= 2;
+        m_remaining -= 4;
     }

-    if (m_remaining > 0) {
+    while (m_remaining > 0) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -1197,6 +1275,14 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32(
             n_remaining -= 1;
         }
+
+        // Move to next row
+        AdvanceRowPtrs<1>(
+            StrideQuantA, ldc,
+            QuantARowPtr, SumRowPtr
+        );
+
+        m_remaining -= 1;
     }
 }
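The same row-loop restructuring is applied in all three kernels above: the main loop now consumes four rows per iteration, and the former single-shot `if (m_remaining > 0)` remainder becomes a loop so that one to three leftover rows are each handled individually. A minimal sketch of that control flow under assumed names (`TileRows`, `rows_body4`, `rows_body1` are illustrative, not the MLAS code):

```cpp
#include <cstddef>

// rows_body4 / rows_body1 stand in for the 4x2 and 1x1 tile computations.
template <typename Body4, typename Body1>
void TileRows(size_t count_m, Body4 rows_body4, Body1 rows_body1) {
    size_t m_remaining = count_m;
    size_t row = 0;
    while (m_remaining > 3) {  // main loop: 4 rows per iteration
        rows_body4(row);
        row += 4;
        m_remaining -= 4;
    }
    while (m_remaining > 0) {  // tail: 1-3 leftover rows, one at a time
        rows_body1(row);
        row += 1;
        m_remaining -= 1;
    }
}
```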
diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
index 9ee8f27df5c99..2f335009b59c6 100644
--- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
+++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
@@ -15,12 +15,10 @@
 from typing import List, Optional

 TRT_DOCKER_FILES = {
-    "8.4.cuda_11_6_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4",
-    "8.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5",
     "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6",
     "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6",
-    "10.0.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0",
-    "10.0.cuda_12_4_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0",
+    "10.2.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10",
+    "10.2.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10",
     "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin",
 }

diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md
index 6fba98c14e792..cd8a8756d681e 100644
--- a/onnxruntime/python/tools/transformers/models/llama/README.md
+++ b/onnxruntime/python/tools/transformers/models/llama/README.md
@@ -27,8 +27,6 @@ Please note the package versions needed for using LLaMA-2 in the `requirements.t
   - Note that `torch` with CUDA enabled is not installed automatically. This is because `torch` should be installed with the CUDA version used on your machine. Please visit [the PyTorch website](https://pytorch.org/get-started/locally/) to download the `torch` version that is used with the CUDA version installed on your machine and satisfies the requirement listed in the file.
 - `requirements-quant.txt`
   - For running the SmoothQuant algorithm using [Intel's Neural Compressor](https://github.com/intel/neural-compressor)
-- `requirements-70b-model.txt`
-  - For running the LLaMA-2 70B model on multiple GPUs
 - `requirements.txt`
   - Package versions needed in each of the above files

@@ -221,18 +219,6 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l
 $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-cpu --precision int4 --quantization_method blockwise --execution_provider cpu --use_gqa
 ```

-Export LLaMA-2 70B sharded model into 4 partitions
-```
-# From source:
-# 1. Install necessary packages from requirements-70b-model.txt
-$ pip install -r requirements-70b-model.txt
-
-# 2. Build ONNX Runtime from source with NCCL enabled. Here is a sample command:
-$ ./build.sh --config Release --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/
-
-# 3. Shard and export the LLaMA-2 70B model. With FP16, you will need at least 140GB of GPU memory to load the model. Therefore, you will need at least 4 40GB A100 GPUs or 2 80GB A100 GPUs to shard the PyTorch model and export each shard to ONNX. Here is an example command:
-$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-distributed --precision fp16 --execution_provider cuda --use_gqa
-```

 ## Parity Checking LLaMA-2

@@ -395,18 +381,6 @@ CUDA_VISIBLE_DEVICES=4 python3 -m models.llama.benchmark \
     --device cuda
 ```

-9. ONNX Runtime, FP16, convert_to_onnx, LLaMA-2 70B shard to 4 GPUs
-```
-CUDA_VISIBLE_DEVICES=4,5,6,7 bash benchmark_70b_model.sh 4 \
-    --benchmark-type ort-convert-to-onnx \
-    --ort-model-path ./llama2-70b-dis/rank_{}_Llama-2-70b-hf_decoder_merged_model_fp16.onnx \
-    --model-name meta-llama/Llama-2-70b-hf \
-    --cache-dir ./model_cache \
-    --precision fp16 \
-    --device cuda \
-    --warmup-runs 5 \
-    --num-runs 100
-```

 You can profile a variant by adding the `--profile` flag and providing one batch size and sequence length combination.

diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh
deleted file mode 100644
index 38f1916456658..0000000000000
--- a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-NUM_GPUS=${1:-1}
-
-MPI="mpirun --allow-run-as-root
-    -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0
-    --tag-output --npernode $NUM_GPUS --bind-to numa
-    -x MIOPEN_FIND_MODE=1"
-
-CMD="$MPI python benchmark.py ${@:2}"
-
-$CMD
\ No newline at end of file
diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh
deleted file mode 100644
index 637d15c10e0c7..0000000000000
--- a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-NUM_GPUS=${1:-1}
-
-MPI="mpirun --allow-run-as-root
-    -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0
-    --tag-output --npernode $NUM_GPUS --bind-to numa
-    -x MIOPEN_FIND_MODE=1"
-
-CMD="$MPI python convert_to_onnx.py ${@:2}"
-
-$CMD
\ No newline at end of file
diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt
deleted file mode 100644
index 572cfdb71be4a..0000000000000
--- a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt
+++ /dev/null
@@ -1,4 +0,0 @@
--r requirements.txt
-git+https://github.com/frankdongms/transformers.git@frdong/shard_llama
-mpi4py
-psutil
\ No newline at end of file
diff --git a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc
index bd97306142f18..4fc2e6c7c909b 100644
--- a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc
@@ -18,13 +18,17 @@ constexpr double DOUBLE_NINF = -std::numeric_limits<double>::infinity();
 constexpr double DOUBLE_NAN = std::numeric_limits<double>::quiet_NaN();

 template <typename T>
-void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list<T>& input, const std::initializer_list<bool>& output) {
+void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list<T>& input, const std::initializer_list<bool>& output, bool skip_trt = false) {
   OpTester test("IsInf", opset);
   test.AddAttribute<int64_t>("detect_positive", detect_positive);
   test.AddAttribute<int64_t>("detect_negative", detect_negative);
   test.AddInput<T>("X", {onnxruntime::narrow<int64_t>(input.size())}, input);
   test.AddOutput<bool>("Y", {onnxruntime::narrow<int64_t>(output.size())}, output);
-  test.Run();
+  if (skip_trt) {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  } else {
+    test.Run();
+  }
 }

 TEST(IsInfTest, test_isinf_float10) {
@@ -124,7 +128,7 @@ TEST(IsInfTest, test_isinf_bfloat16) {
   std::initializer_list<BFloat16> input = {BFloat16{-1.7f}, BFloat16::NaN, BFloat16::Infinity, 3.6_bfp16, BFloat16::NegativeInfinity, BFloat16::Infinity};
   std::initializer_list<bool> output = {false, false, true, false, true, true};
-  run_is_inf_test(20, 1, 1, input, output);
+  run_is_inf_test(20, 1, 1, input, output, true);  // Skip as TRT10 supports BF16 but T4 GPU run on TRT CIs doesn't
 }

 TEST(IsInfTest, test_isinf_positive_bfloat16) {
@@ -146,7 +150,7 @@ TEST(IsInfTest, test_Float8E4M3FN) {
   std::initializer_list<Float8E4M3FN> input = {
       Float8E4M3FN(-1.0f), Float8E4M3FN(FLOAT_NAN, false), Float8E4M3FN(1.0f), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_INF, false)};
   std::initializer_list<bool> output = {false, false, false, false, false, false};
-  run_is_inf_test(20, 1, 1, input, output);
+  run_is_inf_test(20, 1, 1, input, output, true);  // Skip as TRT10.1 supports Float8 but T4 GPU run on TRT CIs doesn't
 }

 TEST(IsInfTest, test_Float8E4M3FNUZ) {
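Usage sketch of the new `skip_trt` flag within `isinf_test.cc`: the test still runs on the other registered execution providers, and TensorRT is excluded through the excluded-provider set of `OpTester::Run` (the same call the patch adds inside `run_is_inf_test`). The input values here are illustrative, not from the patch:

```cpp
// Hypothetical call site inside isinf_test.cc; assumes the test fixtures above.
std::initializer_list<BFloat16> input = {BFloat16::Infinity, BFloat16{1.0f}};
std::initializer_list<bool> output = {true, false};
run_is_inf_test(20, /*detect_positive*/ 1, /*detect_negative*/ 1,
                input, output, /*skip_trt*/ true);
```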
diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index 41b3c47ba0396..a66828ee5e188 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -43,7 +43,7 @@ variables:
 - name: docker_base_image
   value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
 - name: linux_trt_version
-  value: 10.0.1.6-1.cuda11.8
+  value: 10.2.0.19-1.cuda11.8
 - name: Repository
   value: 'onnxruntimecuda11manylinuxbuild'

diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
index 8b386dde7d3a7..700326fe9173c 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
@@ -83,7 +83,7 @@ variables:
   value: 11.8

- name: win_trt_home
-  value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8
+  value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8
- name: win_cuda_home
   value: $(Agent.TempDirectory)\v11.8

diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
index daf95af438d2b..9fd13b513e5fd 100644
--- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
@@ -68,9 +68,9 @@ variables:
     value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
 - name: win_trt_home
   ${{ if eq(parameters.CudaVersion, '11.8') }}:
-    value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8
+    value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8
   ${{ if eq(parameters.CudaVersion, '12.2') }}:
-    value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4
+    value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5
 - name: win_cuda_home
   ${{ if eq(parameters.CudaVersion, '11.8') }}:
     value: $(Agent.TempDirectory)\v11.8

diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
index 5f63339fb0d00..3f9707ff50519 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
@@ -43,9 +43,9 @@ variables:
     value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
 - name: linux_trt_version
   ${{ if eq(parameters.CudaVersion, '11.8') }}:
-    value: 10.0.1.6-1.cuda11.8
+    value: 10.2.0.19-1.cuda11.8
   ${{ if eq(parameters.CudaVersion, '12.2') }}:
-    value: 10.0.1.6-1.cuda12.4
+    value: 10.2.0.19-1.cuda12.5

 jobs:
 - job: Linux_Build

diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
index b9a5383836447..56e9c73a10a82 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
@@ -61,7 +61,7 @@ stages:
           ${{ if eq(parameters.CudaVersion, '12.2') }}:
             DockerBuildArgs: "
               --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04
-              --build-arg TRT_VERSION=10.0.1.6-1+cuda12.4
+              --build-arg TRT_VERSION=10.2.0.19-1+cuda12.5
               --build-arg BUILD_UID=$( id -u )
               "
           ${{ else }}:

diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
index f3604dba1ac9d..593d45361324e 100644
--- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
@@ -226,7 +226,7 @@ stages:
       BuildConfig: 'RelWithDebInfo'
      EnvSetupScript: setup_env_trt.bat
      buildArch: x64
-      additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+      additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
      msbuildPlatform: x64
      isX86: false
      job_name_suffix: x64_RelWithDebInfo
diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
index 63e70fa8e6488..d57a7585f3cff 100644
--- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
@@ -55,7 +55,7 @@ stages:
     python_wheel_suffix: '_gpu'
     timeout: 480
     docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
-    trt_version: '10.0.1.6-1.cuda11.8'
+    trt_version: '10.2.0.19-1.cuda11.8'
     cuda_version: '11.8'

diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
index b6943f9e1b77b..7dfafeb67acf8 100644
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
@@ -49,9 +49,9 @@ jobs:
       value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: 10.0.1.6-1.cuda11.8
+      value: 10.2.0.19-1.cuda11.8
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: 10.0.1.6-1.cuda12.4
+      value: 10.2.0.19-1.cuda12.5
   pool: ${{ parameters.machine_pool }}
   steps:
   - checkout: self

diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
index cca53e36ebab9..2ca5129ac6e5d 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
@@ -80,9 +80,9 @@ stages:

   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: 10.0.1.6-1.cuda11.8
+      value: 10.2.0.19-1.cuda11.8
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: 10.0.1.6-1.cuda12.4
+      value: 10.2.0.19-1.cuda12.5
   steps:
   - checkout: self
     clean: true
@@ -149,9 +149,9 @@ stages:
       value: '12'
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: 10.0.1.6-1.cuda11.8
+      value: 10.2.0.19-1.cuda11.8
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: 10.0.1.6-1.cuda12.4
+      value: 10.2.0.19-1.cuda12.5
   steps:
   - checkout: self  # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime
     submodules: false

diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
index 01f0337be7714..dcd681bd4b915 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
@@ -65,9 +65,9 @@ stages:
       SpecificArtifact: ${{ parameters.SpecificArtifact }}
       BuildId: ${{ parameters.BuildId }}
       ${{ if eq(parameters.cuda_version, '11.8') }}:
-        EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+        EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
       ${{ if eq(parameters.cuda_version, '12.2') }}:
-        EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+        EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"

 - ${{ if eq(parameters.enable_linux_gpu, true) }}:
   - template: ../templates/py-linux-gpu.yml
@@ -79,7 +79,7 @@ stages:
       cuda_version: ${{ parameters.cuda_version }}
       ${{ if eq(parameters.cuda_version, '11.8') }}:
         docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
-        trt_version: 10.0.1.6-1.cuda11.8
+        trt_version: 10.2.0.19-1.cuda11.8
       ${{ if eq(parameters.cuda_version, '12.2') }}:
         docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
-        trt_version: 10.0.1.6-1.cuda12.4
+        trt_version: 10.2.0.19-1.cuda12.5

diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
index 0dd9ffd5282e7..de29a3de9fded 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
@@ -13,10 +13,10 @@ parameters:
     - 12.2
 - name: TrtVersion
   type: string
-  default: '10.0.1.6'
+  default: '10.2.0.19'
   values:
     - 8.6.1.6
-    - 10.0.1.6
+    - 10.2.0.19

 steps:
 - ${{ if eq(parameters.DownloadCUDA, true) }}:
@@ -42,9 +42,9 @@ steps:
     - powershell: |
         Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0"
       displayName: Set trtCudaVersion
-  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.0.1.6')) }}:
+  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.2.0.19')) }}:
     - powershell: |
-        Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.4"
+        Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.5"
       displayName: Set trtCudaVersion

 - script: |
"https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4" $(Agent.TempDirectory) - displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4' + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5" $(Agent.TempDirectory) + displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5' - task: BatchScript@1 displayName: 'setup env' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml index 97f95797be1f1..6c66cceb33d5c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml @@ -22,10 +22,10 @@ parameters: - name: trt_version type: string - default: '10.0.1.6-1.cuda11.8' + default: '10.2.0.19-1.cuda11.8' values: - - 10.0.1.6-1.cuda11.8 - - 10.0.1.6-1.cuda12.4 + - 10.2.0.19-1.cuda11.8 + - 10.2.0.19-1.cuda12.5 - name: cuda_version type: string default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 3081624225b12..8eca22c8c123f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -18,10 +18,10 @@ parameters: - name: trt_version type: string - default: '10.0.1.6-1.cuda11.8' + default: '10.2.0.19-1.cuda11.8' values: - - 10.0.1.6-1.cuda11.8 - - 10.0.1.6-1.cuda12.4 + - 10.2.0.19-1.cuda11.8 + - 10.2.0.19-1.cuda12.5 - name: cuda_version type: string default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml index 3f1c4ef0f8d61..47980955b8798 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml @@ -381,7 +381,7 @@ stages: variables: CUDA_VERSION: '11.8' buildArch: x64 - EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" + EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" EnvSetupScript: setup_env_gpu.bat EP_NAME: gpu VSGenerator: 'Visual Studio 17 2022' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 9e14789f3b234..27f85dc5c1648 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -288,7 +288,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.8' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" 
--cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -298,7 +298,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.9' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -308,7 +308,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.10' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -318,7 +318,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.11' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -328,7 +328,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.12' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -498,7 +498,7 @@ stages: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} - trt_version: '10.0.1.6-1.cuda11.8' + trt_version: '10.2.0.19-1.cuda11.8' cuda_version: '11.8' - ${{ if eq(parameters.enable_windows_arm64_qnn, true) }}: diff --git 
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
index 1af00da01241a..70c0c7d4a04e7 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
@@ -55,7 +55,7 @@ jobs:
       WithCache: True
       Today: $(TODAY)
       AdditionalKey: "gpu-tensorrt | RelWithDebInfo"
-      BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86'
+      BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86'
       MsbuildArguments: $(MsbuildArguments)
       BuildArch: 'x64'
       Platform: 'x64'
@@ -75,7 +75,7 @@ jobs:
        del wheel_filename_file
        python.exe -m pip install -q --upgrade %WHEEL_FILENAME%
        set PATH=$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo;%PATH%
-       python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
+       python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
      workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
      displayName: 'Run tests'

diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
index 86c178aae519b..2d3dc05285e3c 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
-ARG TRT_VERSION=10.0.1.6-1.cuda11.8
+ARG TRT_VERSION=10.2.0.19-1.cuda11.8
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}

diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
index 4542d3a3f2e4c..a50788e98ffe0 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
-ARG TRT_VERSION=10.0.1.6-1.cuda11.8
+ARG TRT_VERSION=10.2.0.19-1.cuda11.8
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}

diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
index 5ef56fd885ca7..1aca3e305452d 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
-ARG TRT_VERSION=10.0.1.6-1+cuda11.8
+ARG TRT_VERSION=10.2.0.19-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION

diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg
index 194a22850030c..5697120a48b2b 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
-ARG TRT_VERSION=10.0.1.6-1+cuda11.8
+ARG TRT_VERSION=10.2.0.19-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
-# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with TensorRT integration - -FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 - - -# ONNX Runtime Variables -ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update &&\ - apt-get install -y sudo git bash unattended-upgrades wget -RUN unattended-upgrade - -# Install python3 -RUN apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - python3-dev \ - python3-wheel &&\ - cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip; - -RUN pip install --upgrade pip -RUN pip install setuptools>=68.2.2 - -# Install TensorRT -RUN v="8.4.1-1+cuda11.6" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ - apt-get update &&\ - sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ - libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ - python3-libnvinfer=${v} libnvinfer-samples=${v} - -# Compile trtexec -RUN cd /usr/src/tensorrt/samples/trtexec && make - -# Install Valgrind -RUN apt-get install -y valgrind - -ARG BUILD_USER=onnxruntimedev -ARG BUILD_UID=1000 -RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID -USER $BUILD_USER -WORKDIR /code -ENV CUDA_MODULE_LOADING "LAZY" - -# Prepare onnxruntime repository & build onnxruntime with TensorRT -RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ - /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ - cd onnxruntime &&\ - /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' &&\ - pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ - cd .. diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 deleted file mode 100644 index cfc7023ef8e61..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 +++ /dev/null @@ -1,92 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# --------------------------------------------------------------
-# Dockerfile to run ONNXRuntime with TensorRT integration
-
-# Build base image with required system packages
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base
-
-# The local directory into which to build and install CMAKE
-ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update &&\
-    apt-get install -y sudo git bash unattended-upgrades wget
-RUN unattended-upgrade
-
-# Install python3
-RUN apt-get install -y --no-install-recommends \
-    python3 \
-    python3-pip \
-    python3-dev \
-    python3-wheel &&\
-    cd /usr/local/bin &&\
-    ln -s /usr/bin/python3 python &&\
-    ln -s /usr/bin/pip3 pip;
-
-RUN pip install --upgrade pip
-RUN pip install setuptools>=68.2.2
-
-# Install TensorRT
-RUN v="8.5.1-1+cuda11.8" &&\
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
-    apt-get update &&\
-    sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \
-        libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \
-        python3-libnvinfer=${v} libnvinfer-samples=${v}
-
-# Compile trtexec
-RUN cd /usr/src/tensorrt/samples/trtexec && make
-
-# Install Valgrind
-RUN apt-get install -y valgrind
-
-# Build final image from base. Builds ORT.
-FROM base as final
-ARG BUILD_USER=onnxruntimedev
-ARG BUILD_UID=1000
-RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID
-USER $BUILD_USER
-
-# ONNX Runtime arguments
-
-# URL to the github repo from which to clone ORT.
-ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
-
-# The local directory into which to clone ORT.
-ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-
-# The git branch of ORT to checkout and build.
-ARG ONNXRUNTIME_BRANCH=main
-
-# Optional. The specific commit to pull and build from. If not set, the latest commit is used.
-ARG ONNXRUNTIME_COMMIT_ID
-
-# The supported CUDA architecture
-ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80
-
-WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}
-
-# Clone ORT repository with branch
-RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
-    /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh
-
-WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime
-
-# Reset to a specific commit if specified by build args.
-RUN if [ -z "$ONNXRUNTIME_COMMIT_ID" ] ; then echo "Building branch ${ONNXRUNTIME_BRANCH}" ;\
-    else echo "Building branch ${ONNXRUNTIME_BRANCH} @ commit ${ONNXRUNTIME_COMMIT_ID}" &&\
-    git reset --hard ${ONNXRUNTIME_COMMIT_ID} && git submodule update --recursive ; fi
-
-# Build ORT
-ENV CUDA_MODULE_LOADING "LAZY"
-RUN /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"'
-
-# Switch to root to continue following steps of CI
-USER root
-
-# Intall ORT wheel
-RUN pip install ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime/build/Linux/Release/dist/*.whl
\ No newline at end of file
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
similarity index 99%
rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0
rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
index cd168e1911d95..0bd56a1a5873f 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
@@ -31,7 +31,7 @@ RUN pip install --upgrade pip
 RUN pip install psutil setuptools>=68.2.2
 
 # Install TensorRT
-RUN version="10.0.1.6-1+cuda11.8" &&\
+RUN version="10.2.0.19-1+cuda11.8" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
     apt-get install -y \
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
similarity index 83%
rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0
rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
index 3e48415118c63..7f66943dd8745 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
@@ -5,7 +5,7 @@
 # Dockerfile to run ONNXRuntime with TensorRT integration
 
 # Build base image with required system packages
-FROM nvidia/cuda:12.4.1-devel-ubuntu20.04 AS base
+FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base
 
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
@@ -30,15 +30,27 @@ RUN apt-get install -y --no-install-recommends \
 RUN pip install --upgrade pip
 RUN pip install setuptools>=68.2.2 psutil
 
-# Install cuDNN v9
-RUN apt-get -y install cudnn9-cuda-12
-
 # Install TensorRT
-RUN version="10.0.1.6-1+cuda12.4" &&\
+RUN version="10.2.0.19-1+cuda12.5" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
     apt-get install -y \
-    tensorrt=${version}
+    libnvinfer-dev=${version} \
+    libnvinfer-dispatch-dev=${version} \
+    libnvinfer-dispatch10=${version} \
+    libnvinfer-headers-dev=${version} \
+    libnvinfer-headers-plugin-dev=${version} \
+    libnvinfer-lean-dev=${version} \
+    libnvinfer-lean10=${version} \
+    libnvinfer-plugin-dev=${version} \
+    libnvinfer-plugin10=${version} \
+    libnvinfer-vc-plugin-dev=${version} \
+    libnvinfer-vc-plugin10=${version} \
+    libnvinfer10=${version} \
+    libnvonnxparsers-dev=${version} \
+    libnvonnxparsers10=${version} \
+    tensorrt-dev=${version} \
+    libnvinfer-bin=${version}
 
 # Compile trtexec if not installed
 RUN if [ ! -d /usr/src/tensorrt/bin ] || [ ! -f /usr/src/tensorrt/bin/trtexec ]; then \
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
index a26bf88fbbdf6..0281c1c8fef25 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
@@ -5,7 +5,7 @@
 # Dockerfile to run ONNXRuntime with TensorRT installed from provided binaries
 
 # Build base image with required system packages
-FROM nvidia/cuda:12.3.1-devel-ubuntu20.04 AS base
+FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base
 
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
@@ -30,9 +30,6 @@ RUN apt-get install -y --no-install-recommends \
 RUN pip install --upgrade pip
 RUN pip install setuptools>=68.2.2
 
-# Install cuDNN v9
-RUN apt-get -y install cudnn9-cuda-12
-
 # Install TensorRT
 # Must provide version numbers used to build the name of the tar file containing TensorRT binaries.
 # See: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-tar
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
index 3a7f410d3859e..a0020a9827290 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
@@ -5,7 +5,7 @@
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
 FROM $BASEIMAGE
-ARG TRT_VERSION=10.0.1.6-1.cuda11.8
+ARG TRT_VERSION=10.2.0.19-1.cuda11.8
 
 #Install TensorRT only if TRT_VERSION is not empty
 RUN if [ -n "${TRT_VERSION}" ]; then \
diff --git a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt
index b8c00a610b781..6ece3c1f92c4e 100644
--- a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt
+++ b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt
@@ -1,2 +1,2 @@
-azure-kusto-data[pandas]==3.0.1
-azure-kusto-ingest[pandas]==3.0.1
+azure-kusto-data[pandas]==4.5.1
+azure-kusto-ingest[pandas]==4.5.1
diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat
index b753cdae16b90..6c59866ea925a 100644
--- a/tools/ci_build/github/windows/setup_env_gpu.bat
+++ b/tools/ci_build/github/windows/setup_env_gpu.bat
@@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
 ) else (
   set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH%
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
 @REM The default version is still cuda v11.8, because set cuda v12.2 after it
-set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4\lib
+set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib
 if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
   set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
 ) else (
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 4e43b5999a315..249bb98815897 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
 ) else (
   set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
 set CUDA_MODULE_LOADING=LAZY
\ No newline at end of file
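
A quick sanity check, not part of the diff above: since the Linux images now pin individual libnvinfer* packages at 10.2.0.19 rather than the tensorrt metapackage, one might verify the resolved version inside a built image. This is a minimal sketch; the image tag is a placeholder, not one produced by this change.

    # Hypothetical image tag; substitute whatever the CI build actually produces.
    docker run --rm ort-trt-build:latest \
        dpkg-query -W -f='${Package} ${Version}\n' libnvinfer10
    # Expected per this upgrade: 10.2.0.19-1+cuda11.8 (CUDA 11 images)
    #                         or 10.2.0.19-1+cuda12.5 (CUDA 12 images)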