-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV] Move check if any vector insts will be generated to VPlan. #96622
Changes from 10 commits
82f5e6a
0789f5d
9951ec2
2099a8f
a0d5050
1cce5c4
c321381
cc437d8
a5128c5
7835ae0
3b78232
da1ae93
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -1090,7 +1090,7 @@ class LoopVectorizationCostModel { | |||||||||
bool selectUserVectorizationFactor(ElementCount UserVF) { | ||||||||||
collectUniformsAndScalars(UserVF); | ||||||||||
collectInstsToScalarize(UserVF); | ||||||||||
return expectedCost(UserVF).first.isValid(); | ||||||||||
return expectedCost(UserVF).isValid(); | ||||||||||
} | ||||||||||
|
||||||||||
/// \return The size (in bits) of the smallest and widest types in the code | ||||||||||
|
@@ -1591,20 +1591,13 @@ class LoopVectorizationCostModel { | |||||||||
Scalars.clear(); | ||||||||||
} | ||||||||||
|
||||||||||
/// The vectorization cost is a combination of the cost itself and a boolean | ||||||||||
/// indicating whether any of the contributing operations will actually | ||||||||||
/// operate on vector values after type legalization in the backend. If this | ||||||||||
/// latter value is false, then all operations will be scalarized (i.e. no | ||||||||||
/// vectorization has actually taken place). | ||||||||||
using VectorizationCostTy = std::pair<InstructionCost, bool>; | ||||||||||
|
||||||||||
/// Returns the expected execution cost. The unit of the cost does | ||||||||||
/// not matter because we use the 'cost' units to compare different | ||||||||||
/// vector widths. The cost that is returned is *not* normalized by | ||||||||||
/// the factor width. If \p Invalid is not nullptr, this function | ||||||||||
/// will add a pair(Instruction*, ElementCount) to \p Invalid for | ||||||||||
/// each instruction that has an Invalid cost for the given VF. | ||||||||||
VectorizationCostTy | ||||||||||
InstructionCost | ||||||||||
expectedCost(ElementCount VF, | ||||||||||
SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); | ||||||||||
|
||||||||||
|
@@ -1642,12 +1635,7 @@ class LoopVectorizationCostModel { | |||||||||
|
||||||||||
/// Returns the execution time cost of an instruction for a given vector | ||||||||||
/// width. Vector width of one means scalar. | ||||||||||
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); | ||||||||||
|
||||||||||
/// The cost-computation logic from getInstructionCost which provides | ||||||||||
/// the vector type as an output parameter. | ||||||||||
InstructionCost getInstructionCost(Instruction *I, ElementCount VF, | ||||||||||
Type *&VectorTy); | ||||||||||
InstructionCost getInstructionCost(Instruction *I, ElementCount VF); | ||||||||||
|
||||||||||
/// Return the cost of instructions in an inloop reduction pattern, if I is | ||||||||||
/// part of that pattern. | ||||||||||
|
@@ -4795,9 +4783,101 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, | |||||||||
} while (!Tail.empty()); | ||||||||||
} | ||||||||||
|
||||||||||
/// Check if any recipe of \p Plan will generate a vector value, which will be | ||||||||||
/// assigned a vector register. | ||||||||||
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, | ||||||||||
const TargetTransformInfo &TTI) { | ||||||||||
assert(VF.isVector() && "Checking a scalar VF?"); | ||||||||||
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), | ||||||||||
Plan.getCanonicalIV()->getScalarType()->getContext()); | ||||||||||
// Set of types known to not generate vector values. | ||||||||||
DenseSet<Type *> WillNotWiden; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed, thanks! |
||||||||||
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( | ||||||||||
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suffice to traverse shallow - inner regions replicate scalar values only - stopping at vector loop region's exit block? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For now yes I think, as replicate regions won't contain any widen recipes. |
||||||||||
for (VPRecipeBase &R : *VPBB) { | ||||||||||
// Continue early if the recipe is considered to not produce a vector | ||||||||||
// result. Note that this includes VPInstruction where some opcodes may | ||||||||||
// produce a vector, to preserve existing behavior as VPInstructions model | ||||||||||
// aspects not directly mapped to existing IR instructions. | ||||||||||
switch (R.getVPDefID()) { | ||||||||||
case VPDef::VPDerivedIVSC: | ||||||||||
case VPDef::VPScalarIVStepsSC: | ||||||||||
case VPDef::VPScalarCastSC: | ||||||||||
case VPDef::VPReplicateSC: | ||||||||||
case VPDef::VPInstructionSC: | ||||||||||
case VPDef::VPCanonicalIVPHISC: | ||||||||||
case VPDef::VPVectorPointerSC: | ||||||||||
case VPDef::VPExpandSCEVSC: | ||||||||||
case VPDef::VPEVLBasedIVPHISC: | ||||||||||
case VPDef::VPPredInstPHISC: | ||||||||||
case VPDef::VPBranchOnMaskSC: | ||||||||||
continue; | ||||||||||
case VPDef::VPReductionSC: | ||||||||||
case VPDef::VPActiveLaneMaskPHISC: | ||||||||||
case VPDef::VPWidenCallSC: | ||||||||||
case VPDef::VPWidenCanonicalIVSC: | ||||||||||
case VPDef::VPWidenCastSC: | ||||||||||
case VPDef::VPWidenGEPSC: | ||||||||||
case VPDef::VPWidenSC: | ||||||||||
case VPDef::VPWidenSelectSC: | ||||||||||
case VPDef::VPBlendSC: | ||||||||||
case VPDef::VPFirstOrderRecurrencePHISC: | ||||||||||
case VPDef::VPWidenPHISC: | ||||||||||
case VPDef::VPWidenIntOrFpInductionSC: | ||||||||||
case VPDef::VPWidenPointerInductionSC: | ||||||||||
case VPDef::VPReductionPHISC: | ||||||||||
case VPDef::VPInterleaveSC: | ||||||||||
case VPDef::VPWidenLoadEVLSC: | ||||||||||
case VPDef::VPWidenLoadSC: | ||||||||||
case VPDef::VPWidenStoreEVLSC: | ||||||||||
case VPDef::VPWidenStoreSC: | ||||||||||
break; | ||||||||||
default: | ||||||||||
llvm_unreachable("unhandled recipe"); | ||||||||||
} | ||||||||||
|
||||||||||
auto WillWiden = [&TTI, VF](Type *ScalarTy) { | ||||||||||
Type *VectorTy = ToVectorTy(ScalarTy, VF); | ||||||||||
unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); | ||||||||||
if (!NumLegalParts) | ||||||||||
return false; | ||||||||||
if (VF.isScalable()) { | ||||||||||
// <vscale x 1 x iN> is assumed to be profitable over iN because | ||||||||||
// scalable registers are a distinct register class from scalar | ||||||||||
// ones. If we ever find a target which wants to lower scalable | ||||||||||
// vectors back to scalars, we'll need to update this code to | ||||||||||
// explicitly ask TTI about the register class uses for each part. | ||||||||||
return NumLegalParts <= VF.getKnownMinValue(); | ||||||||||
} | ||||||||||
// Two or more parts that share a register - are vectorized. | ||||||||||
return NumLegalParts < VF.getKnownMinValue(); | ||||||||||
}; | ||||||||||
|
||||||||||
// If no def nor is a store, e.g., branches, continue - no value to check. | ||||||||||
if (R.getNumDefinedValues() == 0 && | ||||||||||
!isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( | ||||||||||
&R)) | ||||||||||
continue; | ||||||||||
// For multi-def recipes, currently only interleaved loads, suffice to | ||||||||||
// check first def only. | ||||||||||
// For stores check their stored value; for interleaved stores suffice | ||||||||||
// the check first stored value only. In all cases this is the second | ||||||||||
// operand. | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, the stored widened values are presumably defined earlier, inside the vector loop, so would it suffice to check the types of widened values defined by widening recipes only, and ignore checking those used by stores? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that would potentially miss cases where we have wide stores of loop-invariant values only, left as is for now.! |
||||||||||
VPValue *ToCheck = | ||||||||||
R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1); | ||||||||||
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); | ||||||||||
if (!WillNotWiden.insert({ScalarTy}).second) | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Inserting before checking will include will-not-widen types along with the first will-widen type (if one exists), so slightly more accurate to call the cache
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed, thanks! |
||||||||||
continue; | ||||||||||
if (WillWiden(ScalarTy)) | ||||||||||
return true; | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
return false; | ||||||||||
} | ||||||||||
|
||||||||||
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { | ||||||||||
InstructionCost ExpectedCost = | ||||||||||
CM.expectedCost(ElementCount::getFixed(1)).first; | ||||||||||
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); | ||||||||||
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); | ||||||||||
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); | ||||||||||
assert(any_of(VPlans, | ||||||||||
|
@@ -4826,9 +4906,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { | |||||||||
if (VF.isScalar()) | ||||||||||
continue; | ||||||||||
|
||||||||||
LoopVectorizationCostModel::VectorizationCostTy C = | ||||||||||
CM.expectedCost(VF, &InvalidCosts); | ||||||||||
VectorizationFactor Candidate(VF, C.first, ScalarCost.ScalarCost); | ||||||||||
InstructionCost C = CM.expectedCost(VF, &InvalidCosts); | ||||||||||
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); | ||||||||||
|
||||||||||
#ifndef NDEBUG | ||||||||||
unsigned AssumedMinimumVscale = | ||||||||||
|
@@ -4845,7 +4924,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { | |||||||||
LLVM_DEBUG(dbgs() << ".\n"); | ||||||||||
#endif | ||||||||||
|
||||||||||
if (!C.second && !ForceVectorization) { | ||||||||||
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { | ||||||||||
LLVM_DEBUG( | ||||||||||
dbgs() | ||||||||||
<< "LV: Not considering vector loop of width " << VF | ||||||||||
|
@@ -5146,7 +5225,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, | |||||||||
// If we did not calculate the cost for VF (because the user selected the VF) | ||||||||||
// then we calculate the cost of VF here. | ||||||||||
if (LoopCost == 0) { | ||||||||||
LoopCost = expectedCost(VF).first; | ||||||||||
LoopCost = expectedCost(VF); | ||||||||||
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); | ||||||||||
|
||||||||||
// Loop body is free and there is no need for interleaving. | ||||||||||
|
@@ -5717,15 +5796,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( | |||||||||
|
||||||||||
// Compute the cost of the vector instruction. Note that this cost already | ||||||||||
// includes the scalarization overhead of the predicated instruction. | ||||||||||
InstructionCost VectorCost = getInstructionCost(I, VF).first; | ||||||||||
InstructionCost VectorCost = getInstructionCost(I, VF); | ||||||||||
|
||||||||||
// Compute the cost of the scalarized instruction. This cost is the cost of | ||||||||||
// the instruction as if it wasn't if-converted and instead remained in the | ||||||||||
// predicated block. We will scale this cost by block probability after | ||||||||||
// computing the scalarization overhead. | ||||||||||
InstructionCost ScalarCost = | ||||||||||
VF.getFixedValue() * | ||||||||||
getInstructionCost(I, ElementCount::getFixed(1)).first; | ||||||||||
VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)); | ||||||||||
|
||||||||||
// Compute the scalarization overhead of needed insertelement instructions | ||||||||||
// and phi nodes. | ||||||||||
|
@@ -5769,14 +5847,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( | |||||||||
return Discount; | ||||||||||
} | ||||||||||
|
||||||||||
LoopVectorizationCostModel::VectorizationCostTy | ||||||||||
LoopVectorizationCostModel::expectedCost( | ||||||||||
InstructionCost LoopVectorizationCostModel::expectedCost( | ||||||||||
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { | ||||||||||
VectorizationCostTy Cost; | ||||||||||
InstructionCost Cost; | ||||||||||
|
||||||||||
// For each block. | ||||||||||
for (BasicBlock *BB : TheLoop->blocks()) { | ||||||||||
VectorizationCostTy BlockCost; | ||||||||||
InstructionCost BlockCost; | ||||||||||
|
||||||||||
// For each instruction in the old loop. | ||||||||||
for (Instruction &I : BB->instructionsWithoutDebug()) { | ||||||||||
|
@@ -5785,22 +5862,19 @@ LoopVectorizationCostModel::expectedCost( | |||||||||
(VF.isVector() && VecValuesToIgnore.count(&I))) | ||||||||||
continue; | ||||||||||
|
||||||||||
VectorizationCostTy C = getInstructionCost(&I, VF); | ||||||||||
InstructionCost C = getInstructionCost(&I, VF); | ||||||||||
|
||||||||||
// Check if we should override the cost. | ||||||||||
if (C.first.isValid() && | ||||||||||
ForceTargetInstructionCost.getNumOccurrences() > 0) | ||||||||||
C.first = InstructionCost(ForceTargetInstructionCost); | ||||||||||
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) | ||||||||||
C = InstructionCost(ForceTargetInstructionCost); | ||||||||||
|
||||||||||
// Keep a list of instructions with invalid costs. | ||||||||||
if (Invalid && !C.first.isValid()) | ||||||||||
if (Invalid && !C.isValid()) | ||||||||||
Invalid->emplace_back(&I, VF); | ||||||||||
|
||||||||||
BlockCost.first += C.first; | ||||||||||
BlockCost.second |= C.second; | ||||||||||
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first | ||||||||||
<< " for VF " << VF << " For instruction: " << I | ||||||||||
<< '\n'); | ||||||||||
BlockCost += C; | ||||||||||
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " | ||||||||||
<< VF << " For instruction: " << I << '\n'); | ||||||||||
} | ||||||||||
|
||||||||||
// If we are vectorizing a predicated block, it will have been | ||||||||||
|
@@ -5811,10 +5885,9 @@ LoopVectorizationCostModel::expectedCost( | |||||||||
// cost by the probability of executing it. blockNeedsPredication from | ||||||||||
// Legal is used so as to not include all blocks in tail folded loops. | ||||||||||
if (VF.isScalar() && Legal->blockNeedsPredication(BB)) | ||||||||||
BlockCost.first /= getReciprocalPredBlockProb(); | ||||||||||
BlockCost /= getReciprocalPredBlockProb(); | ||||||||||
|
||||||||||
Cost.first += BlockCost.first; | ||||||||||
Cost.second |= BlockCost.second; | ||||||||||
Cost += BlockCost; | ||||||||||
} | ||||||||||
|
||||||||||
return Cost; | ||||||||||
|
@@ -6213,49 +6286,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, | |||||||||
return getWideningCost(I, VF); | ||||||||||
} | ||||||||||
|
||||||||||
LoopVectorizationCostModel::VectorizationCostTy | ||||||||||
LoopVectorizationCostModel::getInstructionCost(Instruction *I, | ||||||||||
ElementCount VF) { | ||||||||||
// If we know that this instruction will remain uniform, check the cost of | ||||||||||
// the scalar version. | ||||||||||
if (isUniformAfterVectorization(I, VF)) | ||||||||||
VF = ElementCount::getFixed(1); | ||||||||||
|
||||||||||
if (VF.isVector() && isProfitableToScalarize(I, VF)) | ||||||||||
return VectorizationCostTy(InstsToScalarize[VF][I], false); | ||||||||||
|
||||||||||
// Forced scalars do not have any scalarization overhead. | ||||||||||
auto ForcedScalar = ForcedScalars.find(VF); | ||||||||||
if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { | ||||||||||
auto InstSet = ForcedScalar->second; | ||||||||||
if (InstSet.count(I)) | ||||||||||
return VectorizationCostTy( | ||||||||||
(getInstructionCost(I, ElementCount::getFixed(1)).first * | ||||||||||
VF.getKnownMinValue()), | ||||||||||
false); | ||||||||||
} | ||||||||||
|
||||||||||
Type *VectorTy; | ||||||||||
InstructionCost C = getInstructionCost(I, VF, VectorTy); | ||||||||||
|
||||||||||
bool TypeNotScalarized = false; | ||||||||||
if (VF.isVector() && VectorTy->isVectorTy()) { | ||||||||||
if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { | ||||||||||
if (VF.isScalable()) | ||||||||||
// <vscale x 1 x iN> is assumed to be profitable over iN because | ||||||||||
// scalable registers are a distinct register class from scalar ones. | ||||||||||
// If we ever find a target which wants to lower scalable vectors | ||||||||||
// back to scalars, we'll need to update this code to explicitly | ||||||||||
// ask TTI about the register class uses for each part. | ||||||||||
TypeNotScalarized = NumParts <= VF.getKnownMinValue(); | ||||||||||
else | ||||||||||
TypeNotScalarized = NumParts < VF.getKnownMinValue(); | ||||||||||
} else | ||||||||||
C = InstructionCost::getInvalid(); | ||||||||||
} | ||||||||||
return VectorizationCostTy(C, TypeNotScalarized); | ||||||||||
} | ||||||||||
|
||||||||||
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( | ||||||||||
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { | ||||||||||
|
||||||||||
|
@@ -6646,8 +6676,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { | |||||||||
} | ||||||||||
|
||||||||||
InstructionCost | ||||||||||
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, | ||||||||||
Type *&VectorTy) { | ||||||||||
LoopVectorizationCostModel::getInstructionCost(Instruction *I, | ||||||||||
ElementCount VF) { | ||||||||||
// If we know that this instruction will remain uniform, check the cost of | ||||||||||
// the scalar version. | ||||||||||
if (isUniformAfterVectorization(I, VF)) | ||||||||||
VF = ElementCount::getFixed(1); | ||||||||||
|
||||||||||
if (VF.isVector() && isProfitableToScalarize(I, VF)) | ||||||||||
return InstsToScalarize[VF][I]; | ||||||||||
|
||||||||||
// Forced scalars do not have any scalarization overhead. | ||||||||||
auto ForcedScalar = ForcedScalars.find(VF); | ||||||||||
if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { | ||||||||||
auto InstSet = ForcedScalar->second; | ||||||||||
if (InstSet.count(I)) | ||||||||||
return getInstructionCost(I, ElementCount::getFixed(1)) * | ||||||||||
VF.getKnownMinValue(); | ||||||||||
} | ||||||||||
Comment on lines
+6690
to
+6696
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Independent of this patch: if ForcedScalars records its cost, as in InstsToScalarize and WideningDecisions, then ForcedScalars[VF][I] can be returned if exists, similar to InstsToScalarize, rather than invoking a recursive call. Could ForcedScalars be folded into InstsToScalarize. |
||||||||||
|
||||||||||
Type *RetTy = I->getType(); | ||||||||||
if (canTruncateToMinimalBitwidth(I, VF)) | ||||||||||
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); | ||||||||||
|
@@ -6670,6 +6717,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, | |||||||||
}; | ||||||||||
(void) hasSingleCopyAfterVectorization; | ||||||||||
|
||||||||||
Type *VectorTy; | ||||||||||
if (isScalarAfterVectorization(I, VF)) { | ||||||||||
// With the exception of GEPs and PHIs, after scalarization there should | ||||||||||
// only be one copy of the instruction generated in the loop. This is | ||||||||||
|
@@ -6685,6 +6733,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, | |||||||||
} else | ||||||||||
VectorTy = ToVectorTy(RetTy, VF); | ||||||||||
|
||||||||||
if (VF.isVector() && VectorTy->isVectorTy() && | ||||||||||
!TTI.getNumberOfParts(VectorTy)) | ||||||||||
return InstructionCost::getInvalid(); | ||||||||||
|
||||||||||
// TODO: We need to estimate the cost of intrinsic calls. | ||||||||||
switch (I->getOpcode()) { | ||||||||||
case Instruction::GetElementPtr: | ||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could
VectorizationCostTy
be retired?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed, thanks!