[LV] Move check if any vector insts will be generated to VPlan.
This patch moves the check for whether any vector instructions will be
generated out of getInstructionCost and bases it on VPlan instead. This
simplifies getInstructionCost, is more accurate because we check the
final result, and allows us to exit early once we visit a recipe that
generates vector instructions.

The helper can then be re-used by the VPlan-based cost model to match
the legacy selectVectorizationFactor behavior, thus fixing a crash and
paving the way to recommit #92555.
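
At its core, the new helper applies a per-value widening test: a value
counts as widened when TTI legalizes its vector type into at least one
part and fewer parts than scalarization would need; for scalable VFs the
comparison is <= rather than <, since <vscale x 1 x iN> still lives in
vector registers. A minimal sketch of that predicate, restated from the
diff below (the free-standing function name is illustrative, not part of
the commit):

// Sketch only: restates the WillWiden lambda from the diff as a free function.
static bool willWiden(Type *ScalarTy, ElementCount VF,
                      const TargetTransformInfo &TTI) {
  Type *VectorTy = ToVectorTy(ScalarTy, VF);
  unsigned NumParts = TTI.getNumberOfParts(VectorTy);
  if (!NumParts) // The type cannot be legalized into any parts.
    return false;
  // Scalable vectors use a distinct register class, so NumParts equal to
  // the known minimum VF still means vector instructions are generated.
  return VF.isScalable() ? NumParts <= VF.getKnownMinValue()
                         : NumParts < VF.getKnownMinValue();
}

For example, on a target with 128-bit registers, an i32 value at VF=8
becomes <8 x i32>, which legalizes into 2 parts; 2 < 8, so the value
widens and the helper can return true immediately.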
fhahn committed Jun 25, 2024
1 parent 4acc8ee commit 14d39c0
Showing 6 changed files with 96 additions and 100 deletions.
146 changes: 81 additions & 65 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1648,12 +1648,7 @@ class LoopVectorizationCostModel {

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy);
InstructionCost getInstructionCost(Instruction *I, ElementCount VF);

/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
@@ -4879,6 +4874,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
} while (!Tail.empty());
}

static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
const TargetTransformInfo &TTI) {
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
Plan.getCanonicalIV()->getScalarType()->getContext());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
VPVectorPointerRecipe>(&R))
continue;

auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
Type *ScalarTy = TypeInfo.inferScalarType(VPV);
Type *VectorTy = ToVectorTy(ScalarTy, VF);
unsigned NumParts = TTI.getNumberOfParts(VectorTy);
if (!NumParts)
return false;
if (VF.isScalable())
// <vscale x 1 x iN> is assumed to be profitable over iN because
// scalable registers are a distinct register class from scalar ones.
// If we ever find a target which wants to lower scalable vectors
// back to scalars, we'll need to update this code to explicitly
// ask TTI about the register class uses for each part.
return NumParts <= VF.getKnownMinValue();
else
return NumParts < VF.getKnownMinValue();
};
SmallVector<VPValue *> VPValuesToCheck;
if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
VPValuesToCheck.push_back(WidenStore->getOperand(1));
} else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
append_range(VPValuesToCheck, IG->getStoredValues());
} else {
append_range(VPValuesToCheck, R.definedValues());
}
if (any_of(VPValuesToCheck,
[&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
return true;
}
}

return false;
}

VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost ExpectedCost =
CM.expectedCost(ElementCount::getFixed(1)).first;
@@ -4929,7 +4970,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << ".\n");
#endif

if (!C.second && !ForceVectorization) {
if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
@@ -5801,15 +5842,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(

// Compute the cost of the vector instruction. Note that this cost already
// includes the scalarization overhead of the predicated instruction.
InstructionCost VectorCost = getInstructionCost(I, VF).first;
InstructionCost VectorCost = getInstructionCost(I, VF);

// Compute the cost of the scalarized instruction. This cost is the cost of
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
InstructionCost ScalarCost =
VF.getFixedValue() *
getInstructionCost(I, ElementCount::getFixed(1)).first;
VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));

// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
@@ -5869,22 +5909,19 @@ LoopVectorizationCostModel::expectedCost(
(VF.isVector() && VecValuesToIgnore.count(&I)))
continue;

VectorizationCostTy C = getInstructionCost(&I, VF);
InstructionCost C = getInstructionCost(&I, VF);

// Check if we should override the cost.
if (C.first.isValid() &&
ForceTargetInstructionCost.getNumOccurrences() > 0)
C.first = InstructionCost(ForceTargetInstructionCost);
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
C = InstructionCost(ForceTargetInstructionCost);

// Keep a list of instructions with invalid costs.
if (Invalid && !C.first.isValid())
if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);

BlockCost.first += C.first;
BlockCost.second |= C.second;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
<< " for VF " << VF << " For instruction: " << I
<< '\n');
BlockCost.first += C;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
<< VF << " For instruction: " << I << '\n');
}

// If we are vectorizing a predicated block, it will have been
@@ -6297,49 +6334,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
return getWideningCost(I, VF);
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ElementCount VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (isUniformAfterVectorization(I, VF))
VF = ElementCount::getFixed(1);

if (VF.isVector() && isProfitableToScalarize(I, VF))
return VectorizationCostTy(InstsToScalarize[VF][I], false);

// Forced scalars do not have any scalarization overhead.
auto ForcedScalar = ForcedScalars.find(VF);
if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
auto InstSet = ForcedScalar->second;
if (InstSet.count(I))
return VectorizationCostTy(
(getInstructionCost(I, ElementCount::getFixed(1)).first *
VF.getKnownMinValue()),
false);
}

Type *VectorTy;
InstructionCost C = getInstructionCost(I, VF, VectorTy);

bool TypeNotScalarized = false;
if (VF.isVector() && VectorTy->isVectorTy()) {
if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
if (VF.isScalable())
// <vscale x 1 x iN> is assumed to be profitable over iN because
// scalable registers are a distinct register class from scalar ones.
// If we ever find a target which wants to lower scalable vectors
// back to scalars, we'll need to update this code to explicitly
// ask TTI about the register class uses for each part.
TypeNotScalarized = NumParts <= VF.getKnownMinValue();
else
TypeNotScalarized = NumParts < VF.getKnownMinValue();
} else
C = InstructionCost::getInvalid();
}
return VectorizationCostTy(C, TypeNotScalarized);
}

InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {

@@ -6730,8 +6724,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy) {
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ElementCount VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (isUniformAfterVectorization(I, VF))
VF = ElementCount::getFixed(1);

if (VF.isVector() && isProfitableToScalarize(I, VF))
return InstsToScalarize[VF][I];

// Forced scalars do not have any scalarization overhead.
auto ForcedScalar = ForcedScalars.find(VF);
if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
auto InstSet = ForcedScalar->second;
if (InstSet.count(I))
return getInstructionCost(I, ElementCount::getFixed(1)) *
VF.getKnownMinValue();
}

Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6754,6 +6765,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
};
(void) hasSingleCopyAfterVectorization;

Type *VectorTy;
if (isScalarAfterVectorization(I, VF)) {
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
@@ -6769,6 +6781,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
} else
VectorTy = ToVectorTy(RetTy, VF);

if (VF.isVector() && VectorTy->isVectorTy() &&
!TTI.getNumberOfParts(VectorTy))
return InstructionCost::getInvalid();

// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
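A side effect of the merge above is that getInstructionCost now returns a
plain InstructionCost instead of the old (cost, TypeNotScalarized) pair,
so callers no longer thread a widening bit through. A hedged paraphrase
of the resulting loop body in expectedCost, pulling the fragments of the
diff above together (not a verbatim excerpt):

// Paraphrased from the expectedCost hunk above.
InstructionCost C = getInstructionCost(&I, VF);
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
  C = InstructionCost(ForceTargetInstructionCost);
if (Invalid && !C.isValid())
  Invalid->emplace_back(&I, VF);
BlockCost.first += C; // no second pair member left to OR in
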
8 changes: 6 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -68,6 +68,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::PtrAdd:
// Return the type based on the pointer argument (i.e. first operand).
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
return Type::getVoidTy(Ctx);
default:
break;
}
@@ -248,8 +251,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
VPWidenGEPRecipe>([this](const VPRecipeBase *R) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
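The additions above make VPTypeAnalysis total over more of the recipes a
plan can contain: branches now report void rather than being unhandled,
and reductions, vector pointers, and widened canonical IVs infer their
scalar type from their first operand. A hedged illustration of the
resulting contract (the helper name and setup are hypothetical, not from
the commit):

// Hypothetical helper; assumes TypeInfo was constructed as in
// willGenerateVectorInstructions above.
static bool definesWidenableValue(VPValue *VPV, VPTypeAnalysis &TypeInfo) {
  Type *Ty = TypeInfo.inferScalarType(VPV);
  // BranchOnCond/BranchOnCount yield void: there is no value to widen.
  return !Ty->isVoidTy();
}
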
2 changes: 1 addition & 1 deletion llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -1,7 +1,7 @@
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s

define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
;CHECK: vector.body:
; CHECK-NOT: vector.body:
entry:
%0 = alloca i8, align 1
br label %loop
31 changes: 3 additions & 28 deletions llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -622,38 +622,15 @@ define void @wide_iv_trunc_reuse(ptr %dst) {
; CHECK-LABEL: define void @wide_iv_trunc_reuse(
; CHECK-SAME: ptr [[DST:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 3
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 5
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], 7
; CHECK-NEXT: store i32 [[TMP7]], ptr [[DST]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
; CHECK-NEXT: store i32 [[IV_2]], ptr [[DST]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32
; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
Expand Down Expand Up @@ -701,6 +678,4 @@ attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" }
; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
;.
2 changes: 1 addition & 1 deletion llvm/test/Transforms/LoopVectorize/pr32859.ll
@@ -1,4 +1,4 @@
; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s

; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]'
; but the IR Verifier requires for PHI one entry for each predecessor of
7 changes: 4 additions & 3 deletions llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S %s | FileCheck %s

; This test used to crash due to missing Or/Not cases in inferScalarTypeForRecipe.
define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
@@ -65,8 +65,9 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
