Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[VPlan] Port invalid cost remarks to VPlan. #99322

Merged
merged 5 commits into from
Jul 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,9 @@ class LoopVectorizationPlanner {
VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC);

/// Emit remarks for recipes with invalid costs in the available VPlans.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);

protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
Expand Down
158 changes: 95 additions & 63 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
Expand Down Expand Up @@ -889,20 +890,18 @@ static void debugVectorizationMessage(const StringRef Prefix,
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
StringRef RemarkName, Loop *TheLoop, Instruction *I) {
Value *CodeRegion = TheLoop->getHeader();
DebugLoc DL = TheLoop->getStartLoc();

if (I) {
CodeRegion = I->getParent();
// If there is no debug location attached to the instruction, revert back to
// using the loop's.
if (I->getDebugLoc())
DL = I->getDebugLoc();
}
/// the location of the remark. The debug location is taken from \p I if it has
/// one, otherwise from \p DL if passed, otherwise from the start of \p TheLoop.
/// \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
Instruction *I, DebugLoc DL = {}) {
Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
// If debug location is attached to the instruction, use it. Otherwise if DL
// was not provided, use the loop's.
if (I && I->getDebugLoc())
DL = I->getDebugLoc();
else if (!DL)
DL = TheLoop->getStartLoc();
Comment on lines +898 to +904
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(post commit): slight discrepancy with documentation above, should this instead read

  // If debug location is provided, use it. Otherwise if debug location is attached to the instruction, use it. 
  // Otherwise use the start location of the loop.
  if (!DL)
    DL = (I && I->getDebugLoc()) ? I->getDebugLoc() : TheLoop->getStartLoc();


return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
Expand Down Expand Up @@ -943,15 +942,17 @@ void reportVectorizationFailure(const StringRef DebugMsg,

/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop.
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Last sentence repeated twice.

static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
Instruction *I = nullptr) {
OptimizationRemarkEmitter *ORE,
Loop *TheLoop, Instruction *I = nullptr,
DebugLoc DL = {}) {
LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
ORE->emit(
createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
<< Msg);
ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
I, DL)
<< Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
Expand Down Expand Up @@ -1538,12 +1539,8 @@ class LoopVectorizationCostModel {
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width. If \p Invalid is not nullptr, this function
/// will add a pair(Instruction*, ElementCount) to \p Invalid for
/// each instruction that has an Invalid cost for the given VF.
InstructionCost
expectedCost(ElementCount VF,
SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
/// the factor width.
InstructionCost expectedCost(ElementCount VF);

bool hasPredStores() const { return NumPredStores > 0; }

Expand Down Expand Up @@ -4350,24 +4347,38 @@ bool LoopVectorizationPlanner::isMoreProfitable(
return CmpFn(RTCostA, RTCostB);
}

static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
OptimizationRemarkEmitter *ORE,
Loop *TheLoop) {
void LoopVectorizationPlanner::emitInvalidCostRemarks(
OptimizationRemarkEmitter *ORE) {
using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
SmallVector<RecipeVFPair> InvalidCosts;
for (const auto &Plan : VPlans) {
for (ElementCount VF : Plan->vectorFactors()) {
VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx,
CM);
Comment on lines +4357 to +4358
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CostCtx is kept here rather than being hoisted alongside LLVMCtx above, due to its caching of SkipCostComputation. But the latter is initialized by calling LVP::cost(Plan, VF), whereas here all recipes are asked for their cost directly, without going through LVP::cost(). Should LVP::cost() be called first, and the recipes traversed only if it returns an invalid cost? Or, if invalid costs can never be skipped (to be confirmed), calling LVP::cost() is redundant when only invalid costs are sought, in which case CostCtx could be hoisted.

auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (auto &R : *VPBB) {
if (!R.cost(VF, CostCtx).isValid())
InvalidCosts.emplace_back(&R, VF);
}
}
}
}
if (InvalidCosts.empty())
return;

// Emit a report of VFs with invalid costs in the loop.

// Group the remarks per instruction, keeping the instruction order from
// InvalidCosts.
std::map<Instruction *, unsigned> Numbering;
// Group the remarks per recipe, keeping the recipe order from InvalidCosts.
DenseMap<VPRecipeBase *, unsigned> Numbering;
unsigned I = 0;
for (auto &Pair : InvalidCosts)
if (!Numbering.count(Pair.first))
Numbering[Pair.first] = I++;

// Sort the list, first on instruction(number) then on VF.
sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
// Sort the list, first on recipe(number) then on VF.
sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
if (Numbering[A.first] != Numbering[B.first])
return Numbering[A.first] < Numbering[B.first];
const auto &LHS = A.second;
Expand All @@ -4376,38 +4387,64 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
});

// For a list of ordered instruction-vf pairs:
// [(load, vf1), (load, vf2), (store, vf1)]
// Group the instructions together to emit separate remarks for:
// load (vf1, vf2)
// store (vf1)
auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
auto Subset = ArrayRef<InstructionVFPair>();
// For a list of ordered recipe-VF pairs:
// [(load, VF1), (load, VF2), (store, VF1)]
// group the recipes together to emit separate remarks for:
// load (VF1, VF2)
// store (VF1)
auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
auto Subset = ArrayRef<RecipeVFPair>();
do {
if (Subset.empty())
Subset = Tail.take_front(1);

Instruction *I = Subset.front().first;

// If the next instruction is different, or if there are no other pairs,
VPRecipeBase *R = Subset.front().first;

unsigned Opcode =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This retains current dumps, a worthy (temporary) objective, but deserves further attention - recipes (including those with invalid cost) should arguably print themselves, as in R.print(), perhaps supporting a shorter printing of their "opcode" only?

TypeSwitch<const VPRecipeBase *, unsigned>(R)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all cases handled, default deemed unreachable?

.Case<VPHeaderPHIRecipe>(
[](const auto *R) { return Instruction::PHI; })
.Case<VPWidenSelectRecipe>(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
.Case<VPWidenLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe>(
[](const auto *R) { return Instruction::Call; })
.Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCastRecipe>(
[](const auto *R) { return R->getOpcode(); })
.Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
return R->getStoredValues().empty() ? Instruction::Load
: Instruction::Store;
});

// If the next recipe is different, or if there are no other pairs,
// emit a remark for the collated subset. e.g.
// [(load, vf1), (load, vf2))]
// [(load, VF1), (load, VF2)]
// to emit:
// remark: invalid costs for 'load' at VF=(vf, vf2)
if (Subset == Tail || Tail[Subset.size()].first != I) {
// remark: invalid costs for 'load' at VF=(VF1, VF2)
if (Subset == Tail || Tail[Subset.size()].first != R) {
std::string OutString;
raw_string_ostream OS(OutString);
assert(!Subset.empty() && "Unexpected empty range");
OS << "Instruction with invalid costs prevented vectorization at VF=(";
OS << "Recipe with invalid costs prevented vectorization at VF=(";
for (const auto &Pair : Subset)
OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
OS << "):";
if (auto *CI = dyn_cast<CallInst>(I))
OS << " call to " << CI->getCalledFunction()->getName();
else
OS << " " << I->getOpcodeName();
if (Opcode == Instruction::Call) {
auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
Function *CalledFn =
WidenCall ? WidenCall->getCalledScalarFunction()
: cast<Function>(R->getOperand(R->getNumOperands() - 1)
->getLiveInIRValue());
Comment on lines +4440 to +4441
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth at least a comment or assert, noting that if R is not a WidenCall then it is replicating a CallInst, both having the called function as their last operand. It would be better to have ReplicateRecipe (or a derivative thereof) provide the called function explicitly (or print its name), to avoid bypassing CallInst's getCalledFunction(); note also that recipes may use their last operand for an optional mask.

OS << " call to " << CalledFn->getName();
} else
OS << " " << Instruction::getOpcodeName(Opcode);
OS.flush();
reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
R->getDebugLoc());
Tail = Tail.drop_front(Subset.size());
Subset = {};
} else
Expand Down Expand Up @@ -4536,14 +4573,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
ChosenFactor.Cost = InstructionCost::getMax();
}

SmallVector<InstructionVFPair> InvalidCosts;
for (auto &P : VPlans) {
for (ElementCount VF : P->vectorFactors()) {
// The cost for scalar VF=1 is already calculated, so ignore it.
if (VF.isScalar())
continue;

InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);

#ifndef NDEBUG
Expand Down Expand Up @@ -4578,8 +4614,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
}
}

emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);

if (!EnableCondStoresVectorization && CM.hasPredStores()) {
reportVectorizationFailure(
"There are conditional stores.",
Expand Down Expand Up @@ -5484,8 +5518,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
return Discount;
}

InstructionCost LoopVectorizationCostModel::expectedCost(
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
InstructionCost Cost;

// For each block.
Expand All @@ -5505,10 +5538,6 @@ InstructionCost LoopVectorizationCostModel::expectedCost(
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
C = InstructionCost(ForceTargetInstructionCost);

// Keep a list of instructions with invalid costs.
if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);

BlockCost += C;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
<< VF << " For instruction: " << I << '\n');
Expand Down Expand Up @@ -9871,6 +9900,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Plan how to best vectorize, return the best VF and its cost.
std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);

VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
; RUN: FileCheck %s --check-prefix=CHECK-REMARKS < %t

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): alloca
; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; CHECK-REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): alloca
; CHECK-REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @alloca(ptr %vla, i64 %N) {
; CHECK-LABEL: @alloca(
; CHECK-NOT: <vscale x
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ for.end:
}

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_no_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
Expand All @@ -127,10 +127,10 @@ for.cond.cleanup: ; preds = %for.body
}

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:40: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:40: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_no_mapping_ite(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping_ite
; CHECK-NOT: <vscale x
Expand Down Expand Up @@ -163,9 +163,9 @@ for.cond.cleanup: ; preds = %for.body
}

; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_fixed_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_fixed_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
Expand Down
Loading