diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6d5ffc66d98b257..b7471bab128509d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2775,18 +2775,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_interp_p1_f16:
     SelectInterpP1F16(N);
     return;
-  case Intrinsic::amdgcn_inverse_ballot:
-    switch (N->getOperand(1).getValueSizeInBits()) {
-    case 32:
-      Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
-      break;
-    case 64:
-      Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
-      break;
-    default:
-      llvm_unreachable("Unsupported size for inverse ballot mask.");
-    }
-    break;
   default:
     SelectCode(N);
     break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index dcb0f47973c4a80..da3e8c0a62b0894 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1055,8 +1055,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
     return selectIntrinsicCmp(I);
   case Intrinsic::amdgcn_ballot:
     return selectBallot(I);
-  case Intrinsic::amdgcn_inverse_ballot:
-    return selectInverseBallot(I);
   case Intrinsic::amdgcn_reloc_constant:
     return selectRelocConstant(I);
   case Intrinsic::amdgcn_groupstaticsize:
@@ -1449,17 +1447,6 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
   return true;
 }
 
-bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
-  MachineBasicBlock *BB = I.getParent();
-  const DebugLoc &DL = I.getDebugLoc();
-  const Register DstReg = I.getOperand(0).getReg();
-  const Register MaskReg = I.getOperand(2).getReg();
-
-  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
-  I.eraseFromParent();
-  return true;
-}
-
 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
   Register DstReg = I.getOperand(0).getReg();
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 2d3317e04ce126a..43ed210508d3316 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -112,7 +112,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectDivScale(MachineInstr &MI) const;
   bool selectIntrinsicCmp(MachineInstr &MI) const;
   bool selectBallot(MachineInstr &I) const;
-  bool selectInverseBallot(MachineInstr &I) const;
   bool selectRelocConstant(MachineInstr &I) const;
   bool selectGroupStaticSize(MachineInstr &I) const;
   bool selectReturnAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b68962e0541ce41..d5ffb4478bee150 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5480,24 +5480,11 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     return BB;
   }
   case AMDGPU::S_INVERSE_BALLOT_U32:
-  case AMDGPU::S_INVERSE_BALLOT_U64: {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    const SIRegisterInfo *TRI = ST.getRegisterInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-    const Register DstReg = MI.getOperand(0).getReg();
-    Register MaskReg = MI.getOperand(1).getReg();
-
-    const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
-
-    if (IsVALU) {
-      MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
-    }
-
-    BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
-    MI.eraseFromParent();
+  case AMDGPU::S_INVERSE_BALLOT_U64:
+    // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
+    // necessary. After that they are equivalent to a COPY.
+    MI.setDesc(TII->get(AMDGPU::COPY));
     return BB;
-  }
   case AMDGPU::ENDPGM_TRAP: {
     const DebugLoc &DL = MI.getDebugLoc();
     if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7f7b7c447204258..52044791e6c666f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6686,7 +6686,9 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
       MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
       MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
       MI.getOpcode() == AMDGPU::S_WQM_B32 ||
-      MI.getOpcode() == AMDGPU::S_WQM_B64) {
+      MI.getOpcode() == AMDGPU::S_WQM_B64 ||
+      MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
+      MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
     MachineOperand &Src = MI.getOperand(1);
     if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 77b17a0f2789b53..f2721fbd164bff6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -212,9 +212,15 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
 }
 
 let usesCustomInserter = 1 in {
-def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>;
+def S_INVERSE_BALLOT_U32 : SPseudoInstSI<
+  (outs SReg_32:$sdst), (ins SSrc_b32:$mask),
+  [(set i1:$sdst, (int_amdgcn_inverse_ballot i32:$mask))]
+>;
 
-def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
+def S_INVERSE_BALLOT_U64 : SPseudoInstSI<
+  (outs SReg_64:$sdst), (ins SSrc_b64:$mask),
+  [(set i1:$sdst, (int_amdgcn_inverse_ballot i64:$mask))]
+>;
 } // End usesCustomInserter = 1
 
 // Pseudo instructions used for @llvm.fptrunc.round upward
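
For reference, a minimal LLVM IR sketch of the kind of call these TableGen patterns now select directly; the function, block, and value names below are illustrative, not taken from an existing test. Each active lane receives the bit of the uniform mask corresponding to its lane id, so the intrinsic yields an i1 per lane:

; Illustrative only: exercises the 64-bit variant of the intrinsic.
define amdgpu_cs void @inverse_ballot_example(i64 inreg %mask, ptr addrspace(1) %out) {
entry:
  ; Lane N reads bit N of %mask.
  %bit = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %mask)
  %val = select i1 %bit, i32 1, i32 0
  store i32 %val, ptr addrspace(1) %out
  ret void
}

declare i1 @llvm.amdgcn.inverse.ballot.i64(i64)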