From d4e46f0e864e37085da0c5e56e4f6f278e2f7aee Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 11 Jul 2024 09:18:50 +0100 Subject: [PATCH] [AMDGPU] Fix machine verification failure from INIT_EXEC lowering (#98333) Fix machine verification failure from INIT_EXEC lowering since it was moved from SILowerControlFlow to SIWholeQuadMode in #94452. --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 4 +- llvm/test/CodeGen/AMDGPU/wqm.ll | 75 ++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 9f064493f50475..ae91cb31590cfd 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1676,6 +1676,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() && KillInstrs.empty()) { lowerLiveMaskQueries(); + if (!InitExecInstrs.empty()) + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); return !InitExecInstrs.empty() || !LiveMaskQueries.empty(); } @@ -1717,7 +1719,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC); // If we performed any kills then recompute EXEC - if (!KillInstrs.empty()) + if (!KillInstrs.empty() || !InitExecInstrs.empty()) LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); return true; diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index c621904ff727be..11003c4c9edfdd 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -3463,6 +3463,81 @@ bb: ret void } +; Test a case that failed machine verification. +define amdgpu_gs void @wqm_init_exec_switch(i32 %arg) { +; GFX9-W64-LABEL: wqm_init_exec_switch: +; GFX9-W64: ; %bb.0: +; GFX9-W64-NEXT: s_mov_b64 exec, 0 +; GFX9-W64-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: wqm_init_exec_switch: +; GFX10-W32: ; %bb.0: +; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: v_cmpx_lt_i32_e32 0, v0 +; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 +; GFX10-W32-NEXT: s_endpgm + call void @llvm.amdgcn.init.exec(i64 0) + switch i32 %arg, label %bb1 [ + i32 0, label %bb3 + i32 1, label %bb2 + ] +bb1: + ret void +bb2: + ret void +bb3: + ret void +} + +define amdgpu_gs void @wqm_init_exec_wwm() { +; GFX9-W64-LABEL: wqm_init_exec_wwm: +; GFX9-W64: ; %bb.0: +; GFX9-W64-NEXT: s_mov_b64 exec, 0 +; GFX9-W64-NEXT: s_mov_b32 s1, 0 +; GFX9-W64-NEXT: s_mov_b32 s0, s1 +; GFX9-W64-NEXT: s_cmp_lg_u64 exec, 0 +; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-W64-NEXT: exp mrt0 off, off, off, off +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: wqm_init_exec_wwm: +; GFX10-W32: ; %bb.0: +; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, 0 +; GFX10-W32-NEXT: s_cmp_lg_u64 exec, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, s1 +; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-W32-NEXT: s_cselect_b32 s0, -1, 0 +; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-W32-NEXT: exp mrt0 off, off, off, off +; GFX10-W32-NEXT: s_endpgm + call void @llvm.amdgcn.init.exec(i64 0) + %i = call i64 @llvm.amdgcn.ballot.i64(i1 true) + %i1 = call i32 @llvm.amdgcn.wwm.i32(i32 0) + %i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 0 + %i3 = bitcast <2 x i32> %i2 to i64 + %i4 = icmp ne i64 %i, 0 + %i5 = icmp ne i64 %i3, 0 + %i6 = xor i1 %i4, %i5 + %i7 = uitofp i1 %i6 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %i7, float 0.0, float 0.0, float 0.0, i1 false, i1 false) + ret void +} + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1