Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[VPlan] Port invalid cost remarks to VPlan. #99322

Merged
merged 5 commits into from
Jul 27, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,9 @@ class LoopVectorizationPlanner {
VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC);

/// Emit remarks for recipes with invalid costs in the available VPlans.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);

protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
Expand Down
110 changes: 75 additions & 35 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
Expand Down Expand Up @@ -891,10 +892,12 @@ static void debugVectorizationMessage(const StringRef Prefix,
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add documentation for \p DL?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added, thanks!

static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
StringRef RemarkName, Loop *TheLoop, Instruction *I) {
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
Instruction *I, DebugLoc DL = {}) {
Value *CodeRegion = TheLoop->getHeader();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps slightly clarify the precedence. If DL is provided, should it have highest precedence?

Suggested change
Value *CodeRegion = TheLoop->getHeader();
Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
// If debug location is attached to the instruction, use it. Otherwise if DL was not provided, use the loop's.
if (I && I->getdebugLoc())
DL = I->getDebugLoc();
else if (!DL)
DL = TheLoop->getStartLoc();

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adjusted, thanks!

DebugLoc DL = TheLoop->getStartLoc();
if (!DL)
DL = TheLoop->getStartLoc();

if (I) {
CodeRegion = I->getParent();
Expand Down Expand Up @@ -945,13 +948,14 @@ void reportVectorizationFailure(const StringRef DebugMsg,
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about \p DL?

static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
Instruction *I = nullptr) {
OptimizationRemarkEmitter *ORE,
Loop *TheLoop, Instruction *I = nullptr,
DebugLoc DL = {}) {
LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
ORE->emit(
createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
<< Msg);
ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
I, DL)
<< Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
Expand Down Expand Up @@ -1541,9 +1545,7 @@ class LoopVectorizationCostModel {
/// the factor width. If \p Invalid is not nullptr, this function
/// will add a pair(Instruction*, ElementCount) to \p Invalid for
/// each instruction that has an Invalid cost for the given VF.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drop last sentence about \p Invalid?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed thanks!

InstructionCost
expectedCost(ElementCount VF,
SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
InstructionCost expectedCost(ElementCount VF);

bool hasPredStores() const { return NumPredStores > 0; }

Expand Down Expand Up @@ -4343,24 +4345,43 @@ bool LoopVectorizationPlanner::isMoreProfitable(
return CmpFn(RTCostA, RTCostB);
}

static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
OptimizationRemarkEmitter *ORE,
Loop *TheLoop) {
void LoopVectorizationPlanner::emitInvalidCostRemarks(
OptimizationRemarkEmitter *ORE) {
if (VPlans.empty())
return;

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (VPlans.empty())
return;

Can be dropped.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dropped, thanks!

using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
SmallVector<RecipeVFPair> InvalidCosts;
for (const auto &Plan : VPlans) {
for (ElementCount VF : Plan->vectorFactors()) {
LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx,
CM);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can be defined at the outset?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately not at the moment, as we mark underlying instructions for VPReplicateRecipes to only compute the cost once, which means we would skip them in some cases

Comment on lines +4357 to +4358
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CostCtx is kept here rather than hoisting it alongside LLVMCtx above, due to its caching of SkipCostComputation. But the latter is initialized by calling LVP::cost(Plan, VF), whereas here all recipes are asked for their cost directly, w/o going through LVP::cost(). Should LVP::cost() be called first, and iff it returns invalid traverse the recipes? Or note that invalid costs cannot be skipped(?), so calling LVP::cost() is redundant when only invalid costs are sought, in which case CostCtx can be hoisted(?)

auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (auto &R : *VPBB) {
if (R.cost(VF, CostCtx).isValid())
continue;
InvalidCosts.emplace_back(&R, VF);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (R.cost(VF, CostCtx).isValid())
continue;
InvalidCosts.emplace_back(&R, VF);
if (!R.cost(VF, CostCtx).isValid())
InvalidCosts.emplace_back(&R, VF);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done thanks!

}
}
}
}
if (InvalidCosts.empty())
return;

// Emit a report of VFs with invalid costs in the loop.

// Group the remarks per instruction, keeping the instruction order from
// InvalidCosts.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Group the remarks per instruction, keeping the instruction order from
// InvalidCosts.
// Group the remarks per recipe, keeping the recipe order from InvalidCosts.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

std::map<Instruction *, unsigned> Numbering;
DenseMap<VPRecipeBase *, unsigned> Numbering;
unsigned I = 0;
for (auto &Pair : InvalidCosts)
if (!Numbering.count(Pair.first))
Numbering[Pair.first] = I++;

// Sort the list, first on instruction(number) then on VF.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Sort the list, first on instruction(number) then on VF.
// Sort the list, first on recipe(number) then on VF.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, thanks!

sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
if (Numbering[A.first] != Numbering[B.first])
return Numbering[A.first] < Numbering[B.first];
const auto &LHS = A.second;
Expand All @@ -4374,33 +4395,57 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
// Group the instructions together to emit separate remarks for:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Group the instructions together to emit separate remarks for:
// For a list of ordered recipe-VF pairs:
// [(load, VF1), (load, VF2), (store, VF1)]
// group the recipes together to emit separate remarks for:

say something about Opcodes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done thanks!

// load (vf1, vf2)
// store (vf1)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// load (vf1, vf2)
// store (vf1)
// load (VF1, VF2)
// store (VF1)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated , thanks!

auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
auto Subset = ArrayRef<InstructionVFPair>();
auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
auto Subset = ArrayRef<RecipeVFPair>();
do {
if (Subset.empty())
Subset = Tail.take_front(1);

Instruction *I = Subset.front().first;
VPRecipeBase *R = Subset.front().first;

unsigned Opcode =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This retains current dumps, a worthy (temporary) objective, but deserves further attention - recipes (including those with invalid cost) should arguably print themselves, as in R.print(), perhaps supporting a shorter printing of their "opcode" only?

TypeSwitch<const VPRecipeBase *, unsigned>(R)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all cases handled, default deemed unreachable?

.Case<VPHeaderPHIRecipe>(
[](const auto *R) { return Instruction::PHI; })
.Case<VPWidenSelectRecipe>(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
.Case<VPWidenLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe>(
[](const auto *R) { return Instruction::Call; })
.Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCastRecipe>(
[](const auto *R) { return R->getOpcode(); })
.Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
return R->getStoredValues().empty() ? Instruction::Load
: Instruction::Store;
});

// If the next instruction is different, or if there are no other pairs,
// emit a remark for the collated subset. e.g.
// [(load, vf1), (load, vf2))]
// to emit:
// remark: invalid costs for 'load' at VF=(vf, vf2)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// If the next instruction is different, or if there are no other pairs,
// emit a remark for the collated subset. e.g.
// [(load, vf1), (load, vf2))]
// to emit:
// remark: invalid costs for 'load' at VF=(vf, vf2)
// If the next recipe is different, or if there are no other pairs,
// emit a remark for the collated subset. e.g.
// [(load, VF1), (load, VF2))]
// to emit:
// remark: invalid costs for 'load' at VF=(VF1, VF2)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, thanks!

if (Subset == Tail || Tail[Subset.size()].first != I) {
if (Subset == Tail || Tail[Subset.size()].first != R) {
std::string OutString;
raw_string_ostream OS(OutString);
assert(!Subset.empty() && "Unexpected empty range");
OS << "Instruction with invalid costs prevented vectorization at VF=(";
OS << "Recipe with invalid costs prevented vectorization at VF=(";
for (const auto &Pair : Subset)
OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
OS << "):";
if (auto *CI = dyn_cast<CallInst>(I))
OS << " call to " << CI->getCalledFunction()->getName();
if (Opcode == Instruction::Call)
OS << " call to "
<< R->getOperand(R->getNumOperands() - 1)
->getLiveInIRValue()
->getName();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better have VPWidenCallRecipe and VPReplicateRecipe-of-a-Call take care of printing themselves via getCalledScalarFunction and underlying.getCalledFunction(), respectively?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

else
OS << " " << I->getOpcodeName();
OS << " " << Instruction::getOpcodeName(Opcode);
OS.flush();
reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
R->getDebugLoc());
Tail = Tail.drop_front(Subset.size());
Subset = {};
} else
Expand Down Expand Up @@ -4529,14 +4574,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
ChosenFactor.Cost = InstructionCost::getMax();
}

SmallVector<InstructionVFPair> InvalidCosts;
for (auto &P : VPlans) {
for (ElementCount VF : P->vectorFactors()) {
// The cost for scalar VF=1 is already calculated, so ignore it.
if (VF.isScalar())
continue;

InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);

#ifndef NDEBUG
Expand Down Expand Up @@ -4571,8 +4615,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
}
}

emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);

if (!EnableCondStoresVectorization && CM.hasPredStores()) {
reportVectorizationFailure(
"There are conditional stores.",
Expand Down Expand Up @@ -5477,8 +5519,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
return Discount;
}

InstructionCost LoopVectorizationCostModel::expectedCost(
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
InstructionCost Cost;

// For each block.
Expand All @@ -5498,10 +5539,6 @@ InstructionCost LoopVectorizationCostModel::expectedCost(
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
C = InstructionCost(ForceTargetInstructionCost);

// Keep a list of instructions with invalid costs.
if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);

BlockCost += C;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
<< VF << " For instruction: " << I << '\n');
Expand Down Expand Up @@ -9841,6 +9878,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Plan how to best vectorize, return the best VF and its cost.
std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);

VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
; RUN: FileCheck %s --check-prefix=CHECK-REMARKS < %t

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): alloca
; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; CHECK-REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): alloca
; CHECK-REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @alloca(ptr %vla, i64 %N) {
; CHECK-LABEL: @alloca(
; CHECK-NOT: <vscale x
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ for.end:
}

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_no_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
Expand All @@ -127,10 +127,10 @@ for.cond.cleanup: ; preds = %for.body
}

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:40: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:40: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_no_mapping_ite(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping_ite
; CHECK-NOT: <vscale x
Expand Down Expand Up @@ -163,9 +163,9 @@ for.cond.cleanup: ; preds = %for.body
}

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_fixed_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_fixed_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
Expand Down
Loading