-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Use native sqrt when flushing denorm is allowed #114173
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Ruiling, Song (ruiling) Changes: Patch is 70.17 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/114173.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 52ca38aca5c771..303defc77cdfb8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11080,7 +11080,8 @@ SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getValueType().getSimpleVT();
const SDValue X = Op.getOperand(0);
- if (allowApproxFunc(DAG, Flags)) {
+ if (allowApproxFunc(DAG, Flags) ||
+ denormalModeIsFlushAllF32(DAG.getMachineFunction())) {
// Instruction is 1ulp but ignores denormals.
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, VT,
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 2140f50611d711..2313ae21f752a3 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -442,23 +442,7 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
; CODEGEN-DAZ-SDAG: ; %bb.0:
; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CODEGEN-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
-; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CODEGEN-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; CODEGEN-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; CODEGEN-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
; CODEGEN-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1
; CODEGEN-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -512,23 +496,7 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
; IR-DAZ-SDAG: ; %bb.0:
; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1
; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -1086,23 +1054,7 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
; CODEGEN-DAZ-SDAG: ; %bb.0:
; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CODEGEN-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
-; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
-; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CODEGEN-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; CODEGEN-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; CODEGEN-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0
; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1132,23 +1084,7 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
; IR-DAZ-SDAG: ; %bb.0:
; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0
; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1714,23 +1650,7 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
; IR-DAZ-SDAG: ; %bb.0:
; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1
; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index c6c145e090829c..c7494f94b21116 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -55,23 +55,7 @@ define float @v_sqrt_f32(float %x) {
; SDAG-DAZ-LABEL: v_sqrt_f32:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-DAZ-LABEL: v_sqrt_f32:
@@ -150,23 +134,7 @@ define float @v_sqrt_f32_fneg(float %x) {
; SDAG-DAZ-LABEL: v_sqrt_f32_fneg:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x8f800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
-; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0
; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-DAZ-LABEL: v_sqrt_f32_fneg:
@@ -248,24 +216,7 @@ define float @v_sqrt_f32_fabs(float %x) {
; SDAG-DAZ-LABEL: v_sqrt_f32_fabs:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT: s_mov_b32 s5, 0x4f800000
-; SDAG-DAZ-NEXT: v_mul_f32_e64 v1, |v0|, s5
-; SDAG-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v0, |v0|, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e64 v0, |v0|
; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-DAZ-LABEL: v_sqrt_f32_fabs:
@@ -347,24 +298,7 @@ define float @v_sqrt_f32_fneg_fabs(float %x) {
; SDAG-DAZ-LABEL: v_sqrt_f32_fneg_fabs:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x8f800000
-; SDAG-DAZ-NEXT: s_mov_b32 s5, 0xcf800000
-; SDAG-DAZ-NEXT: v_mul_f32_e64 v1, |v0|, s5
-; SDAG-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v0, -|v0|, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e64 v0, -|v0|
; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-DAZ-LABEL: v_sqrt_f32_fneg_fabs:
@@ -445,23 +379,7 @@ define float @v_sqrt_f32_ninf(float %x) {
; SDAG-DAZ-LABEL: v_sqrt_f32_ninf:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-DAZ-LABEL: v_sqrt_f32_ninf:
@@ -539,23 +457,7 @@ define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
; SDAG-DAZ-LABEL: v_sqrt_f32_no_infs_attribute:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-DAZ-LABEL: v_sqrt_f32_no_infs_attribute:
@@ -633,23 +535,7 @@ define float @v_sqrt_f32_nnan(float %x) {
; SDAG-DAZ-LABEL: v_sqrt_f32_nnan:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-DAZ-LABEL: v_sqrt_f32_nnan:
@@ -730,25 +616,7 @@ define amdgpu_ps i32 @s_sqrt_f32(float inreg %x) {
;
; SDAG-DAZ-LABEL: s_sqrt_f32:
; SDAG-DAZ: ; %bb.0:
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, s0
; SDAG-DAZ-NEXT: v_readfirstlane_b32 s0, v0
; SDAG-DAZ-NEXT: ; return to shader part epilog
;
@@ -834,25 +702,7 @@ define amdgpu_ps i32 @s_sqrt_f32_ninf(float inreg %x) {
;
; SDAG-DAZ-LABEL: s_sqrt_f32_ninf:
; SDAG-DAZ: ; %bb.0:
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, s0
; SDAG-DAZ-NEXT: v_readfirstlane_b32 s0, v0
; SDAG-DAZ-NEXT: ; return to shader part epilog
;
@@ -959,23 +809,7 @@ define float @v_sqrt_f32_nsz(float %x) {
; SDAG-DAZ-LABEL: v_sqrt_f32_nsz:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-DAZ-LABEL: v_sqrt_f32_nsz:
@@ -1053,23 +887,7 @@ define float @v_sqrt_f32_nnan_ninf(float %x) {
; SDAG-DAZ-LABEL: v_sqrt_f32_nnan_ninf:
; SDAG-DAZ: ; %bb.0:
; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-D...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pretty sure this is wrong and won't pass OpenCL conformance with FTZ on. The sqrt instruction is still 1 ulp, not 0.5 ulp. We should already account for the denormal mode and !fpmath metadata in AMDGPUCodeGenPrepare.
Thanks for the feedback @arsenm. I did not take a close look at the real issue before. I had a look at the code in AMDGPUCodeGenPrepare, and we did not set the fast-math flag properly for this case. But it sounds surprising that OpenCL conformance needs a correctly rounded sqrt. I think it requires 3 ulp for sqrt?
The default requires 2.5 ulp, which is indicated by the !fpmath metadata. You can supply the -cl-fp32-correctly-rounded-divide-sqrt flag to request correctly rounded results (which is also the expectation in OpenMP, CUDA and HIP).
No description provided.