Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Use native sqrt when flushing denorm is allowed #114173

Closed
wants to merge 1 commit into from

Conversation

ruiling
Copy link
Contributor

@ruiling ruiling commented Oct 30, 2024

No description provided.

@llvmbot
Copy link
Collaborator

llvmbot commented Oct 30, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Ruiling, Song (ruiling)

Changes

Patch is 70.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114173.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+2-1)
  • (modified) llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll (+5-85)
  • (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll (+69-566)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll (+1-8)
  • (modified) llvm/test/CodeGen/AMDGPU/rsq.f32.ll (+25-293)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 52ca38aca5c771..303defc77cdfb8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11080,7 +11080,8 @@ SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op.getValueType().getSimpleVT();
   const SDValue X = Op.getOperand(0);
 
-  if (allowApproxFunc(DAG, Flags)) {
+  if (allowApproxFunc(DAG, Flags) ||
+      denormalModeIsFlushAllF32(DAG.getMachineFunction())) {
     // Instruction is 1ulp but ignores denormals.
     return DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, DL, VT,
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 2140f50611d711..2313ae21f752a3 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -442,23 +442,7 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
 ; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CODEGEN-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
-; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; CODEGEN-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CODEGEN-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v1, v0
-; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, v0, v1
-; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v2, v2, v3, v2
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v1, v1, v3, v1
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v1, v4, v1, v2
-; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CODEGEN-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CODEGEN-DAZ-SDAG-NEXT:    v_mov_b32_e32 v2, 0x260
-; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CODEGEN-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CODEGEN-DAZ-SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
 ; CODEGEN-DAZ-SDAG-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; CODEGEN-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v2, v1
 ; CODEGEN-DAZ-SDAG-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -512,23 +496,7 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
 ; IR-DAZ-SDAG:       ; %bb.0:
 ; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; IR-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v1, v0
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v1, v4, v1, v2
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-SDAG-NEXT:    v_mov_b32_e32 v2, 0x260
-; IR-DAZ-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-DAZ-SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
 ; IR-DAZ-SDAG-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IR-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v2, v1
 ; IR-DAZ-SDAG-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -1086,23 +1054,7 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
 ; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CODEGEN-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
-; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; CODEGEN-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CODEGEN-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v1, v0
-; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, v0, v1
-; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v2, v2, v3, v2
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v1, v1, v3, v1
-; CODEGEN-DAZ-SDAG-NEXT:    v_fma_f32 v1, v4, v1, v2
-; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CODEGEN-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CODEGEN-DAZ-SDAG-NEXT:    v_mov_b32_e32 v2, 0x260
-; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CODEGEN-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CODEGEN-DAZ-SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
 ; CODEGEN-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v0, v0
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1132,23 +1084,7 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; IR-DAZ-SDAG:       ; %bb.0:
 ; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; IR-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v1, v0
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v1, v4, v1, v2
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-SDAG-NEXT:    v_mov_b32_e32 v2, 0x260
-; IR-DAZ-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-DAZ-SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
 ; IR-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v0, v0
 ; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1714,23 +1650,7 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
 ; IR-DAZ-SDAG:       ; %bb.0:
 ; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; IR-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v1, v0
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-SDAG-NEXT:    v_fma_f32 v1, v4, v1, v2
-; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-SDAG-NEXT:    v_mov_b32_e32 v2, 0x260
-; IR-DAZ-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-DAZ-SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
 ; IR-DAZ-SDAG-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IR-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v2, v1
 ; IR-DAZ-SDAG-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index c6c145e090829c..c7494f94b21116 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -55,23 +55,7 @@ define float @v_sqrt_f32(float %x) {
 ; SDAG-DAZ-LABEL: v_sqrt_f32:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32:
@@ -150,23 +134,7 @@ define float @v_sqrt_f32_fneg(float %x) {
 ; SDAG-DAZ-LABEL: v_sqrt_f32_fneg:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x8f800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; SDAG-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e64 v0, -v0
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_fneg:
@@ -248,24 +216,7 @@ define float @v_sqrt_f32_fabs(float %x) {
 ; SDAG-DAZ-LABEL: v_sqrt_f32_fabs:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT:    s_mov_b32 s5, 0x4f800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e64 v1, |v0|, s5
-; SDAG-DAZ-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e64 v0, |v0|, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e64 v0, |v0|
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_fabs:
@@ -347,24 +298,7 @@ define float @v_sqrt_f32_fneg_fabs(float %x) {
 ; SDAG-DAZ-LABEL: v_sqrt_f32_fneg_fabs:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x8f800000
-; SDAG-DAZ-NEXT:    s_mov_b32 s5, 0xcf800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e64 v1, |v0|, s5
-; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e64 v0, -|v0|, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e64 v0, -|v0|
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_fneg_fabs:
@@ -445,23 +379,7 @@ define float @v_sqrt_f32_ninf(float %x) {
 ; SDAG-DAZ-LABEL: v_sqrt_f32_ninf:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_ninf:
@@ -539,23 +457,7 @@ define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
 ; SDAG-DAZ-LABEL: v_sqrt_f32_no_infs_attribute:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_no_infs_attribute:
@@ -633,23 +535,7 @@ define float @v_sqrt_f32_nnan(float %x) {
 ; SDAG-DAZ-LABEL: v_sqrt_f32_nnan:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_nnan:
@@ -730,25 +616,7 @@ define amdgpu_ps i32 @s_sqrt_f32(float inreg %x) {
 ;
 ; SDAG-DAZ-LABEL: s_sqrt_f32:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v0, 0xf800000
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, s0, v1
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, s0
-; SDAG-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, s0
 ; SDAG-DAZ-NEXT:    v_readfirstlane_b32 s0, v0
 ; SDAG-DAZ-NEXT:    ; return to shader part epilog
 ;
@@ -834,25 +702,7 @@ define amdgpu_ps i32 @s_sqrt_f32_ninf(float inreg %x) {
 ;
 ; SDAG-DAZ-LABEL: s_sqrt_f32_ninf:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v0, 0xf800000
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, s0, v1
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, s0
-; SDAG-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, s0
 ; SDAG-DAZ-NEXT:    v_readfirstlane_b32 s0, v0
 ; SDAG-DAZ-NEXT:    ; return to shader part epilog
 ;
@@ -959,23 +809,7 @@ define float @v_sqrt_f32_nsz(float %x) {
 ; SDAG-DAZ-LABEL: v_sqrt_f32_nsz:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_nsz:
@@ -1053,23 +887,7 @@ define float @v_sqrt_f32_nnan_ninf(float %x) {
 ; SDAG-DAZ-LABEL: v_sqrt_f32_nnan_ninf:
 ; SDAG-DAZ:       ; %bb.0:
 ; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-D...
[truncated]

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pretty sure this is wrong and won't pass OpenCL conformance with FTZ on. The sqrt instruction is still 1ulp, not 0.5 ulp. We should already account for denormal mode and !fpmath metadata in AMDGPUCodeGenPrepare

@ruiling
Copy link
Contributor Author

ruiling commented Oct 30, 2024

Pretty sure this is wrong and won't pass OpenCL conformance with FTZ on. The sqrt instruction is still 1ulp, not 0.5 ulp. We should already account for denormal mode and !fpmath metadata in AMDGPUCodeGenPrepare

Thanks for the feedback @arsenm, I did not take a close look at the real issue before. I had a look at the code in AMDGPUCodeGenPrepare; we did not set the fast-math flag properly for this case. But it sounds surprising that OpenCL conformance needs a correctly rounded sqrt. I think it requires 3 ulp for sqrt?

@ruiling ruiling closed this Oct 30, 2024
@arsenm
Copy link
Contributor

arsenm commented Oct 30, 2024

Thanks for the feedback @arsenm, I did not take a close look at the real issue before. I had a look at the code in AMDGPUCodeGenPrepare; we did not set the fast-math flag properly for this case. But it sounds surprising that OpenCL conformance needs a correctly rounded sqrt. I think it requires 3 ulp for sqrt?

The default requires 2.5 ulp, which is indicated by !fpmath metadata. You can supply the -cl-fp32-correctly-rounded-divide-sqrt flag to request correctly rounded (which is also the expectation in OpenMP, CUDA and HIP)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants