[LV] Move check if any vector insts will be generated to VPlan. #96622

fhahn · 2024-06-25T11:16:20Z

This patch moves the check if any vector instructions will be generated from getInstructionCost to be based on VPlan. This simplifies getInstructionCost, is more accurate as we check the final result and also allows us to exit early once we visit a recipe that generates vector instructions.

The helper can then be re-used by the VPlan-based cost model to match the legacy selectVectorizationFactor behavior, this fixing a crash and paving the way to recommit #92555.

llvmbot · 2024-06-25T11:16:55Z

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-systemz

Author: Florian Hahn (fhahn)

Changes

This patch moves the check if any vector instructions will be generated from getInstructionCost to be based on VPlan. This simplifies getInstructionCost, is more accurate as we check the final result and also allows us to exit early once we visit a recipe that generates vector instructions.

The helper can then be re-used by the VPlan-based cost model to match the legacy selectVectorizationFactor behavior, this fixing a crash and paving the way to recommit #92555.

Full diff: https://github.com/llvm/llvm-project/pull/96622.diff

6 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+81-65)
(modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+6-2)
(modified) llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll (+1-1)
(modified) llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll (+3-28)
(modified) llvm/test/Transforms/LoopVectorize/pr32859.ll (+1-1)
(modified) llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll (+4-3)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7835836d21ef1..516709acbe704 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1648,12 +1648,7 @@ class LoopVectorizationCostModel {
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
-  /// The cost-computation logic from getInstructionCost which provides
-  /// the vector type as an output parameter.
-  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
-                                     Type *&VectorTy);
+  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
 
   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
@@ -4879,6 +4874,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
   } while (!Tail.empty());
 }
 
+static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
+                                           const TargetTransformInfo &TTI) {
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
+              VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
+              VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+              VPVectorPointerRecipe>(&R))
+        continue;
+
+      auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+        Type *VectorTy = ToVectorTy(ScalarTy, VF);
+        unsigned NumParts = TTI.getNumberOfParts(VectorTy);
+        if (!NumParts)
+          return false;
+        if (VF.isScalable())
+          // <vscale x 1 x iN> is assumed to be profitable over iN because
+          // scalable registers are a distinct register class from scalar ones.
+          // If we ever find a target which wants to lower scalable vectors
+          // back to scalars, we'll need to update this code to explicitly
+          // ask TTI about the register class uses for each part.
+          return NumParts <= VF.getKnownMinValue();
+        else
+          return NumParts < VF.getKnownMinValue();
+      };
+      SmallVector<VPValue *> VPValuesToCheck;
+      if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
+        VPValuesToCheck.push_back(WidenStore->getOperand(1));
+      } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+        append_range(VPValuesToCheck, IG->getStoredValues());
+      } else {
+        append_range(VPValuesToCheck, R.definedValues());
+      }
+      if (any_of(VPValuesToCheck,
+                 [&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
   InstructionCost ExpectedCost =
       CM.expectedCost(ElementCount::getFixed(1)).first;
@@ -4929,7 +4970,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       LLVM_DEBUG(dbgs() << ".\n");
 #endif
 
-      if (!C.second && !ForceVectorization) {
+      if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
         LLVM_DEBUG(
             dbgs()
             << "LV: Not considering vector loop of width " << VF
@@ -5801,15 +5842,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
 
     // Compute the cost of the vector instruction. Note that this cost already
     // includes the scalarization overhead of the predicated instruction.
-    InstructionCost VectorCost = getInstructionCost(I, VF).first;
+    InstructionCost VectorCost = getInstructionCost(I, VF);
 
     // Compute the cost of the scalarized instruction. This cost is the cost of
     // the instruction as if it wasn't if-converted and instead remained in the
     // predicated block. We will scale this cost by block probability after
     // computing the scalarization overhead.
     InstructionCost ScalarCost =
-        VF.getFixedValue() *
-        getInstructionCost(I, ElementCount::getFixed(1)).first;
+        VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
 
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
@@ -5869,22 +5909,19 @@ LoopVectorizationCostModel::expectedCost(
           (VF.isVector() && VecValuesToIgnore.count(&I)))
         continue;
 
-      VectorizationCostTy C = getInstructionCost(&I, VF);
+      InstructionCost C = getInstructionCost(&I, VF);
 
       // Check if we should override the cost.
-      if (C.first.isValid() &&
-          ForceTargetInstructionCost.getNumOccurrences() > 0)
-        C.first = InstructionCost(ForceTargetInstructionCost);
+      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+        C = InstructionCost(ForceTargetInstructionCost);
 
       // Keep a list of instructions with invalid costs.
-      if (Invalid && !C.first.isValid())
+      if (Invalid && !C.isValid())
         Invalid->emplace_back(&I, VF);
 
-      BlockCost.first += C.first;
-      BlockCost.second |= C.second;
-      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
-                        << " for VF " << VF << " For instruction: " << I
-                        << '\n');
+      BlockCost.first += C;
+      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+                        << VF << " For instruction: " << I << '\n');
     }
 
     // If we are vectorizing a predicated block, it will have been
@@ -6297,49 +6334,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
-                                               ElementCount VF) {
-  // If we know that this instruction will remain uniform, check the cost of
-  // the scalar version.
-  if (isUniformAfterVectorization(I, VF))
-    VF = ElementCount::getFixed(1);
-
-  if (VF.isVector() && isProfitableToScalarize(I, VF))
-    return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
-  // Forced scalars do not have any scalarization overhead.
-  auto ForcedScalar = ForcedScalars.find(VF);
-  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
-    auto InstSet = ForcedScalar->second;
-    if (InstSet.count(I))
-      return VectorizationCostTy(
-          (getInstructionCost(I, ElementCount::getFixed(1)).first *
-           VF.getKnownMinValue()),
-          false);
-  }
-
-  Type *VectorTy;
-  InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
-  bool TypeNotScalarized = false;
-  if (VF.isVector() && VectorTy->isVectorTy()) {
-    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
-      if (VF.isScalable())
-        // <vscale x 1 x iN> is assumed to be profitable over iN because
-        // scalable registers are a distinct register class from scalar ones.
-        // If we ever find a target which wants to lower scalable vectors
-        // back to scalars, we'll need to update this code to explicitly
-        // ask TTI about the register class uses for each part.
-        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
-      else
-        TypeNotScalarized = NumParts < VF.getKnownMinValue();
-    } else
-      C = InstructionCost::getInvalid();
-  }
-  return VectorizationCostTy(C, TypeNotScalarized);
-}
-
 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
 
@@ -6730,8 +6724,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 }
 
 InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
-                                               Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+                                               ElementCount VF) {
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (isUniformAfterVectorization(I, VF))
+    VF = ElementCount::getFixed(1);
+
+  if (VF.isVector() && isProfitableToScalarize(I, VF))
+    return InstsToScalarize[VF][I];
+
+  // Forced scalars do not have any scalarization overhead.
+  auto ForcedScalar = ForcedScalars.find(VF);
+  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+    auto InstSet = ForcedScalar->second;
+    if (InstSet.count(I))
+      return getInstructionCost(I, ElementCount::getFixed(1)) *
+             VF.getKnownMinValue();
+  }
+
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6754,6 +6765,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   };
   (void) hasSingleCopyAfterVectorization;
 
+  Type *VectorTy;
   if (isScalarAfterVectorization(I, VF)) {
     // With the exception of GEPs and PHIs, after scalarization there should
     // only be one copy of the instruction generated in the loop. This is
@@ -6769,6 +6781,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   } else
     VectorTy = ToVectorTy(RetTy, VF);
 
+  if (VF.isVector() && VectorTy->isVectorTy() &&
+      !TTI.getNumberOfParts(VectorTy))
+    return InstructionCost::getInvalid();
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index eca5d1d4c5e1d..20dc53e094a74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -68,6 +68,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::PtrAdd:
     // Return the type based on the pointer argument (i.e. first operand).
     return inferScalarType(R->getOperand(0));
+  case VPInstruction::BranchOnCond:
+  case VPInstruction::BranchOnCount:
+    return Type::getVoidTy(Ctx);
   default:
     break;
   }
@@ -248,8 +251,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           })
           .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
               [](const auto *R) { return R->getScalarType(); })
-          .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
-                VPWidenGEPRecipe>([this](const VPRecipeBase *R) {
+          .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
+                VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
+                VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) {
             return inferScalarType(R->getOperand(0));
           })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
index d8535d510ca03..27efd9fade741 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s
 
 define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
-;CHECK: vector.body:
+; CHECK-NOT: vector.body:
 entry:
   %0 = alloca i8, align 1
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index d22ccf6671e84..92b2410d7853a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -622,38 +622,15 @@ define void @wide_iv_trunc_reuse(ptr %dst) {
 ; CHECK-LABEL: define void @wide_iv_trunc_reuse(
 ; CHECK-SAME: ptr [[DST:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 5
-; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], 7
-; CHECK-NEXT:    store i32 [[TMP7]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    store i32 [[IV_2]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 0
 ; CHECK-NEXT:    [[IV_TRUNC]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -701,6 +678,4 @@ attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" }
 ; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
-; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
-; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll
index 24e713a7f2cff..a29a6bd735feb 100644
--- a/llvm/test/Transforms/LoopVectorize/pr32859.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s
 
 ; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]'
 ; but the IR Verifier requires for PHI one entry for each predecessor of
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
index 2007155fe5485..ef3b98d73a87f 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
 
 ; This test used to crash due to missing Or/Not cases in inferScalarTypeForRecipe.
 define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
@@ -65,8 +65,9 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:

fhahn · 2024-07-01T09:55:22Z

ping :)

ayalz · 2024-07-04T14:14:05Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-  /// the vector type as an output parameter.
-  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
-                                     Type *&VectorTy);
+  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);


Could VectorizationCostTy be retired?

Removed, thanks!

ayalz · 2024-07-04T14:14:41Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -4929,7 +4970,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
      LLVM_DEBUG(dbgs() << ".\n");
 #endif

-      if (!C.second && !ForceVectorization) {
+      if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {


Suggested change

if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {

if (!ForceVectorization && !willGenerateVectorInstructions(*P, VF, TTI)) {

Reordered, thanks!

ayalz · 2024-07-04T16:07:33Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {


Suffice to traverse shallow - inner regions replicate scalar values only - stopping at vector loop region's exit block?

For now yes I think, as replicate regions won't contain any widen recipes.

ayalz · 2024-07-04T16:26:51Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+        continue;
+
+      auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);


Time to introduce inferVectorType()?

Added a switch to make sure all recipes are handled; as mentioned above, we need to skip some widen recipes if we want to preserve the existing behavior initially.

ayalz · 2024-07-04T16:29:18Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+        Type *VectorTy = ToVectorTy(ScalarTy, VF);
+        unsigned NumParts = TTI.getNumberOfParts(VectorTy);


NumLegalParts or NumRegisters to avoid confusion with unroll parts?

Updated, thanks!

ayalz · 2024-07-04T18:01:41Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
-                        << " for VF " << VF << " For instruction: " << I
-                        << '\n');
+      BlockCost.first += C;


Should BlockCost also become InstructionCost rather than VectorizationCostTy, as its second seems to become undefined and/or unused?

Yes, updated throughout, thanks!

ayalz · 2024-07-04T18:28:27Z

llvm/test/Transforms/LoopVectorize/pr32859.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s


Why is this forcing of VF needed, here and below - default cost- based decision changes?

This won't generate any vector instructions due to dead instructions. I think in this case it makes sense to retain the vectorization behavior by forcing the VF, as from the comment the goal is to guard against creating invalid phis outside the vector loop.

Indeed good to retain behavior. Is the not-generating-any-vectors-due-to-dead-instructions intentional?

I think it makes sense, as there won't be any code at all generated for dead instructions that have been cleaned up by DCE

ayalz · 2024-07-04T18:32:05Z

llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll

-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]


Is this change desirable.

I've undone the change, this now won't get vectorized due to the vector loop body being empty (except canonical IV, increment, compare and branch). It still serves its original purpose (checking type inference for logical AND VPInst)

ayalz · 2024-07-04T19:43:06Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  auto ForcedScalar = ForcedScalars.find(VF);
+  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+    auto InstSet = ForcedScalar->second;
+    if (InstSet.count(I))
+      return getInstructionCost(I, ElementCount::getFixed(1)) *
+             VF.getKnownMinValue();
+  }


Independent of this patch: if ForcedScalars records its cost, as in InstsToScalarize and WideningDecisions, then ForcedScalars[VF][I] can be returned if exists, similar to InstsToScalarize, rather than invoking a recursive call. Could ForcedScalars be folded into InstsToScalarize.

ayalz · 2024-07-04T19:56:49Z

llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll

@@ -1,7 +1,7 @@
 ; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s

 define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
-;CHECK: vector.body:
+; CHECK-NOT: vector.body:


A more accurate observation if any instruction was vectorized causes LV to avoid (pretending to) vectorizing this loop?

Yep, generated full check lines for the file in 959ff45; there are no vector instructions generated, because the induction is scalarizied, which isn't modeled in the legacy cost model (a similar case is in llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll)

This patch moves the check if any vector instructions will be generated from getInstructionCost to be based on VPlan. This simplifies getInstructionCost, is more accurate as we check the final result and also allows us to exit early once we visit a recipe that generates vector instructions. The helper can then be re-used by the VPlan-based cost model to match the legacy selectVectorizationFactor behavior, this fixing a crash and paving the way to recommit llvm#92555.

Regenerate test checks to better show impact of #96622.

…torInstructions

fhahn

Comments should be addressed now, thanks!

fhahn · 2024-07-04T20:18:19Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-  /// the vector type as an output parameter.
-  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
-                                     Type *&VectorTy);
+  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);


Removed, thanks!

fhahn · 2024-07-04T20:19:07Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -4879,6 +4874,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
  } while (!Tail.empty());
 }

+static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
+                                           const TargetTransformInfo &TTI) {


Added, thanks!

fhahn · 2024-07-04T20:21:38Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {


For now yes I think, as replicate regions won't contain any widen recipes.

fhahn · 2024-07-04T20:37:47Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
+              VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
+              VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+              VPVectorPointerRecipe>(&R))


Removed VPActiveLaneMaskPHIRecipe, retained VPInstruction to retain existing behavior. Added a comment. (VPVectorPointerRecipe is admittedly named a bit confusingly, it won't generate a vector of pointers, but a scalar pointer to be used for a wide memory access)

fhahn · 2024-07-04T20:38:29Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+        continue;
+
+      auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);


Added a switch to make sure all recipes are handled; as mentioned above, we need to skip some widen recipes if we want to preserve the existing behavior initially.

fhahn · 2024-07-04T20:47:43Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -4929,7 +4970,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
      LLVM_DEBUG(dbgs() << ".\n");
 #endif

-      if (!C.second && !ForceVectorization) {
+      if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {


Reordered, thanks!

fhahn · 2024-07-04T20:47:54Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
-                        << " for VF " << VF << " For instruction: " << I
-                        << '\n');
+      BlockCost.first += C;


Yes, updated throughout, thanks!

fhahn · 2024-07-04T20:48:14Z

llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll

@@ -1,7 +1,7 @@
 ; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s

 define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
-;CHECK: vector.body:
+; CHECK-NOT: vector.body:


Yep, generated full check lines for the file in 959ff45; there are no vector instructions generated, because the induction is scalarizied, which isn't modeled in the legacy cost model (a similar case is in llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll)

fhahn · 2024-07-05T10:42:26Z

llvm/test/Transforms/LoopVectorize/pr32859.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s


This won't generate any vector instructions due to dead instructions. I think in this case it makes sense to retain the vectorization behavior by forcing the VF, as from the comment the goal is to guard against creating invalid phis outside the vector loop.

fhahn · 2024-07-05T10:44:14Z

llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll

-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]


I've undone the change, this now won't get vectorized due to the vector loop body being empty (except canonical IV, increment, compare and branch). It still serves its original purpose (checking type inference for logical AND VPInst)

Regenerate test checks to better show impact of llvm#96622.

ayalz · 2024-07-06T09:55:05Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -4795,9 +4783,92 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
  } while (!Tail.empty());
 }

+static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,


Suggested change

static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,

/// Check if any recipe of \p Plan will generate a vector value, which will be assigned a vector register.

static bool willGenerateVectorRegisters(VPlan &Plan, ElementCount VF,

or willGenerateVectors - this not only looks for recipes that will widen, producing vector IR instructions, but also that the vector values they produce will end up utilizing vector registers.

Adjusted, thanks!

ayalz · 2024-07-06T10:03:33Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      }
+
+      auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);


Worth caching ScalarTy's already checked?
WillWiden is essentially a function of ScalarTy rather than of VPV, feed it VPV's inferred scalar type instead of the two ingredients to produce it?

Updated, thanks!

ayalz · 2024-07-06T10:09:43Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
+        VPValuesToCheck.push_back(WidenStore->getOperand(1));
+      } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+        append_range(VPValuesToCheck, IG->getStoredValues());


This misses values defined by interleaved loads, for which getStoredValues() returns none. That is currently the only case having multiple def's, which are all of the same type - so suffice to check only one.

Another reminder to split VPInterleaveRecipe into interleaved loads and interleaved stores.

Another observation is that VPBranchOnMask should be a VPInstruction, producing no value - similar to BranchOnCond and BranchOnCount VPInstructions.

Fixed by always adding defined values, only add first stored value, thanks!

ayalz · 2024-07-06T10:16:04Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+        return NumLegalParts < VF.getKnownMinValue();
+      };
+      SmallVector<VPValue *> VPValuesToCheck;
+      if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {


or VPWidenStoreEVLRecipe?

Ah yes, it is not yet a subclass of VPStoreRecipe.... Updated, thanks!

ayalz · 2024-07-06T10:17:21Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      };
+      SmallVector<VPValue *> VPValuesToCheck;
+      if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
+        VPValuesToCheck.push_back(WidenStore->getOperand(1));


Suggested change

VPValuesToCheck.push_back(WidenStore->getOperand(1));

VPValuesToCheck.push_back(WidenStore->getStoredValue());

Updated, thanks!

ayalz · 2024-07-06T10:58:21Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+        // Two or more parts that share a register - are vectorized.
+        return NumLegalParts < VF.getKnownMinValue();
+      };
+      SmallVector<VPValue *> VPValuesToCheck;


Another way, say, could be:

Suggested change

SmallVector<VPValue *> VPValuesToCheck;

// For multi-def recipes, currently only interleaved loads, suffice to check first def only.

if (R.getNumDefinedValues() >= 1 && WillWiden(R.getVPValue(0)))

return true;

// For stores check their stored value; for interleaved stores, suffice the check first stored value only. In all cases this is the second operand.

if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(&R) &&

WillWiden(R.getOperand(1)))

return true;

Simplified, thanks!

ayalz · 2024-07-06T11:03:53Z

llvm/test/Transforms/LoopVectorize/pr32859.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s


Indeed good to retain behavior. Is the not-generating-any-vectors-due-to-dead-instructions intentional?

…torInstructions

fhahn

Comments should be addressed, thanks!

fhahn · 2024-07-06T12:02:54Z

llvm/test/Transforms/LoopVectorize/pr32859.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s


I think it makes sense, as there won't be any code at all generated for dead instructions that have been cleaned up by DCE

…torInstructions

ayalz · 2024-07-06T12:25:37Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      // result. Note that this includes VPInstruction, where some opcodes may
+      // produce a vector to preserve existing behavior originally as


Suggested change

// result. Note that this includes VPInstruction, where some opcodes may

// produce a vector to preserve existing behavior originally as

// result. Note that this includes VPInstruction where some opcodes may

// produce a vector, to preserve existing behavior as

Adjusted, thanks!

ayalz · 2024-07-06T12:38:47Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  assert(VF.isVector() && "Checking a scalar VF?");
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  DenseMap<Type *, bool> GeneratesVector;


Suffice to hold a set of all visited scalars instead? Only scalars are cached possibly ending with the first vector found - whose insertion is redundant as it early exits the search.

Updated, thanks!

ayalz · 2024-07-06T12:40:01Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      auto WillWiden = [&TypeInfo, &TTI, &GeneratesVector, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+        const auto &[Iter, Ins] = GeneratesVector.insert({ScalarTy, false});
+        if (Ins) {


Early return if in cache?

Updated, thanks!

ayalz · 2024-07-06T21:34:21Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      if (R.getNumDefinedValues() >= 1) {
+        // For multi-def recipes, currently only interleaved loads, suffice to
+        // check first def only.
+        if (WillWiden(R.getVPValue(0)))
+          return true;
+      } else if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe,
+                     VPInterleaveRecipe>(&R) &&
+                 WillWiden(R.getOperand(1))) {
+        // For stores check their stored value; for interleaved stores, suffice
+        // the check first stored value only. In all cases this is the second
+        // operand.
+        return true;
+      }


Trying to see if WillWiden() can be simplified to only check for widening - how about the following (may be too compact, chose which style you prefer):

Suggested change

if (R.getNumDefinedValues() >= 1) {

// For multi-def recipes, currently only interleaved loads, suffice to

// check first def only.

if (WillWiden(R.getVPValue(0)))

return true;

} else if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe,

VPInterleaveRecipe>(&R) &&

WillWiden(R.getOperand(1))) {

// For stores check their stored value; for interleaved stores, suffice

// the check first stored value only. In all cases this is the second

// operand.

return true;

}

// If no def nor is a store, e.g., branches, continue - no value to check.

if (R.getNumDefinedValues() == 0 &&

!isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(&R))

continue;

// For multi-def recipes, currently only interleaved loads, suffice to

// check first def only.

// For stores check their stored value; for interleaved stores suffice

// the check first stored value only. In all cases this is the second

// operand.

VPValue *ToCheck = R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);

Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);

if (WillNotWiden.contains(ScalarTy))

continue;

if (WillWiden(ScalarTy))

return true;

WillNotWiden.insert(ScalarTy);

Adjusted, thanks!

ayalz · 2024-07-06T21:39:42Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+        if (Ins) {
+          Type *VectorTy = ToVectorTy(ScalarTy, VF);
+          unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+          if (!NumLegalParts)


This could also be cached.

It could be cached, but in the latest version, we only visit each scalar type at most once. Left as is for now.

…torInstructions

ayalz

This LGTM, thanks for accommodating! Adding last couple of minor comments.

ayalz · 2024-07-07T11:54:13Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  // Set of types known to not generate vector values.
+  DenseSet<Type *> WillNotWiden;


Suggested change

// Set of types known to not generate vector values.

DenseSet<Type *> WillNotWiden;

// Set of types already visited.

DenseSet<Type *> Visited;

Renamed, thanks!

ayalz · 2024-07-07T11:57:30Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      VPValue *ToCheck =
+          R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
+      Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
+      if (!WillNotWiden.insert({ScalarTy}).second)


Inserting before checking will include will-not-widen types along with the first will-widen type (if one exists), so slightly more accurate to call the cache Visited than WillNotWiden:

Suggested change

if (!WillNotWiden.insert({ScalarTy}).second)

if (!Visited.insert({ScalarTy}).second)

Renamed, thanks!

ayalz · 2024-07-07T12:09:55Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      // check first def only.
+      // For stores check their stored value; for interleaved stores suffice
+      // the check first stored value only. In all cases this is the second
+      // operand.


Ah, the stored widened values are presumably defined earlier, inside the vector loop, so would it suffice to check the types of widened values defined by widening recipes only, and ignore checking those used by stores?

I think that would potentially miss cases where we have wide stores of loop-invariant values only, left as is for now.!

…torInstructions

llvm-ci · 2024-07-07T19:59:40Z

LLVM Buildbot has detected a new failure on builder clang-s390x-linux running on systemz-1 while building llvm at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/42/builds/280

Here is the relevant piece of the build log for the reference:

Step 5 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'libFuzzer-s390x-default-Linux :: fuzzer-timeout.test' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 1: /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/./bin/clang    -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta   --driver-mode=g++ -O2 -gline-tables-only -fsanitize=address,fuzzer -I/home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/lib/fuzzer  /home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/test/fuzzer/TimeoutTest.cpp -o /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/runtimes/runtimes-bins/compiler-rt/test/fuzzer/S390XDefaultLinuxConfig/Output/fuzzer-timeout.test.tmp-TimeoutTest
+ /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/./bin/clang -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta --driver-mode=g++ -O2 -gline-tables-only -fsanitize=address,fuzzer -I/home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/lib/fuzzer /home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/test/fuzzer/TimeoutTest.cpp -o /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/runtimes/runtimes-bins/compiler-rt/test/fuzzer/S390XDefaultLinuxConfig/Output/fuzzer-timeout.test.tmp-TimeoutTest
RUN: at line 2: /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/./bin/clang    -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta   --driver-mode=g++ -O2 -gline-tables-only -fsanitize=address,fuzzer -I/home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/lib/fuzzer  /home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/test/fuzzer/TimeoutEmptyTest.cpp -o /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/runtimes/runtimes-bins/compiler-rt/test/fuzzer/S390XDefaultLinuxConfig/Output/fuzzer-timeout.test.tmp-TimeoutEmptyTest
+ /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/./bin/clang -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta --driver-mode=g++ -O2 -gline-tables-only -fsanitize=address,fuzzer -I/home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/lib/fuzzer /home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/test/fuzzer/TimeoutEmptyTest.cpp -o /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/runtimes/runtimes-bins/compiler-rt/test/fuzzer/S390XDefaultLinuxConfig/Output/fuzzer-timeout.test.tmp-TimeoutEmptyTest
RUN: at line 3: not  /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/runtimes/runtimes-bins/compiler-rt/test/fuzzer/S390XDefaultLinuxConfig/Output/fuzzer-timeout.test.tmp-TimeoutTest -timeout=1 2>&1 | FileCheck /home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/test/fuzzer/fuzzer-timeout.test --check-prefix=TimeoutTest
+ not /home/uweigand/sandbox/buildbot/clang-s390x-linux/stage1/runtimes/runtimes-bins/compiler-rt/test/fuzzer/S390XDefaultLinuxConfig/Output/fuzzer-timeout.test.tmp-TimeoutTest -timeout=1
+ FileCheck /home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/test/fuzzer/fuzzer-timeout.test --check-prefix=TimeoutTest
/home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/test/fuzzer/fuzzer-timeout.test:7:14: error: TimeoutTest: expected string not found in input
TimeoutTest: #0
             ^
<stdin>:19:44: note: scanning from here
==3848847== ERROR: libFuzzer: timeout after 1 seconds
                                           ^
<stdin>:24:104: note: possible intended match here
AddressSanitizer: CHECK failed: asan_report.cpp:199 "((current_error_.kind)) == ((kErrorKindInvalid))" (0x1, 0x0) (tid=3848847)
                                                                                                       ^

Input file: <stdin>
Check file: /home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/compiler-rt/test/fuzzer/fuzzer-timeout.test

-dump-input=help explains the following input dump.

Input was:
<<<<<<
           .
           .
           .
          14: MS: 1 InsertByte-; base unit: 94dd9e08c129c785f7f256e82fbe0a30e6d1ae40 
          15: 0x48,0x69,0x21, 
          16: Hi! 
          17: artifact_prefix='./'; Test unit written to ./timeout-c0a0ad26a634840c67a210fefdda76577b03a111 
          18: Base64: SGkh 
          19: ==3848847== ERROR: libFuzzer: timeout after 1 seconds 
check:7'0                                                X~~~~~~~~~~ error: no match found
          20: AddressSanitizer:DEADLYSIGNAL 
check:7'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          21: ================================================================= 
check:7'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          22: AddressSanitizer:DEADLYSIGNAL 
check:7'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          23: ================================================================= 
check:7'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          24: AddressSanitizer: CHECK failed: asan_report.cpp:199 "((current_error_.kind)) == ((kErrorKindInvalid))" (0x1, 0x0) (tid=3848847) 
check:7'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
check:7'1                                                                                                            ?                         possible intended match
...

fhahn · 2024-07-10T08:43:37Z

While testing the VPlan cost model changes, I found a case where we vectorized with this change but not without it. This was due to the VPlan based analysis not skipping ephemeral values (such as values only used by asserts), leading to vectorization when the only widened values are used by asserts only. Should be fixed by ef89e3e

@arcbbb

This reverts commit 6f538f6. A number of crashes have been fixed by separate fixes, including ttps://github.com//pull/96622. This version of the PR also pre-computes the costs for branches (except the latch) instead of computing their costs as part of costing of replicate regions, as there may not be a direct correspondence between original branches and number of replicate regions. Original message: This adds a new interface to compute the cost of recipes, VPBasicBlocks, VPRegionBlocks and VPlan, initially falling back to the legacy cost model for all recipes. Follow-up patches will gradually migrate recipes to compute their own costs step-by-step. It also adds getBestPlan function to LVP which computes the cost of all VPlans and picks the most profitable one together with the most profitable VF. The VPlan selected by the VPlan cost model is executed and there is an assert to catch cases where the VPlan cost model and the legacy cost model disagree. Even though I checked a number of different build configurations on AArch64 and X86, there may be some differences that have been missed. Additional discussions and context can be found in @arcbbb's #67647 and #67934 which is an earlier version of the current PR. PR: #92555

@arcbbb

This reverts commit 6f538f6. A number of crashes have been fixed by separate fixes, including ttps://github.com/llvm/pull/96622. This version of the PR also pre-computes the costs for branches (except the latch) instead of computing their costs as part of costing of replicate regions, as there may not be a direct correspondence between original branches and number of replicate regions. Original message: This adds a new interface to compute the cost of recipes, VPBasicBlocks, VPRegionBlocks and VPlan, initially falling back to the legacy cost model for all recipes. Follow-up patches will gradually migrate recipes to compute their own costs step-by-step. It also adds getBestPlan function to LVP which computes the cost of all VPlans and picks the most profitable one together with the most profitable VF. The VPlan selected by the VPlan cost model is executed and there is an assert to catch cases where the VPlan cost model and the legacy cost model disagree. Even though I checked a number of different build configurations on AArch64 and X86, there may be some differences that have been missed. Additional discussions and context can be found in @arcbbb's llvm#67647 and llvm#67934 which is an earlier version of the current PR. PR: llvm#92555

fhahn requested review from rengolin, ayalz and aniragil June 25, 2024 11:16

llvmbot added backend:SystemZ vectorizers llvm:transforms labels Jun 25, 2024

ayalz reviewed Jul 4, 2024

View reviewed changes

fhahn added 2 commits July 5, 2024 09:01

!fixup address latest comments, thanks!

0789f5d

fhahn force-pushed the vplan-willGenerateVectorInstructions branch from 14d39c0 to 0789f5d Compare July 5, 2024 08:01

fhahn added a commit that referenced this pull request Jul 5, 2024

[LV] Regenerate test checks for zero_unroll.ll (NFC).

959ff45

Regenerate test checks to better show impact of #96622.

fhahn added 2 commits July 5, 2024 11:38

!fixup

9951ec2

Merge remote-tracking branch 'origin/main' into vplan-willGenerateVec…

2099a8f

…torInstructions

fhahn commented Jul 5, 2024

View reviewed changes

kbluck pushed a commit to kbluck/llvm-project that referenced this pull request Jul 6, 2024

[LV] Regenerate test checks for zero_unroll.ll (NFC).

737dc0a

Regenerate test checks to better show impact of llvm#96622.

ayalz reviewed Jul 6, 2024

View reviewed changes

fhahn added 2 commits July 6, 2024 12:34

Merge remote-tracking branch 'origin/main' into vplan-willGenerateVec…

a0d5050

…torInstructions

!fixup address latest comments

1cce5c4

fhahn commented Jul 6, 2024

View reviewed changes

fhahn added 2 commits July 6, 2024 20:35

Merge remote-tracking branch 'origin/main' into vplan-willGenerateVec…

c321381

…torInstructions

!fixup don't check store recipes for recipes with defined values.

cc437d8

ayalz reviewed Jul 6, 2024

View reviewed changes

fhahn added 2 commits July 7, 2024 12:06

Merge remote-tracking branch 'origin/main' into vplan-willGenerateVec…

a5128c5

…torInstructions

!fixup address latest comments, thanks!

7835ae0

ayalz approved these changes Jul 7, 2024

View reviewed changes

fhahn added 2 commits July 7, 2024 18:31

!fixup Rename WillNotWiden -> Visited.

3b78232

Merge remote-tracking branch 'origin/main' into vplan-willGenerateVec…

da1ae93

…torInstructions

fhahn merged commit 29b8b72 into llvm:main Jul 7, 2024
7 checks passed

fhahn deleted the vplan-willGenerateVectorInstructions branch July 7, 2024 19:08

fhahn mentioned this pull request Jul 10, 2024

[VPlan] First step towards VPlan cost modeling. #92555

Merged

	if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
	if (!ForceVectorization && !willGenerateVectorInstructions(*P, VF, TTI)) {

		@@ -1,4 +1,4 @@
		; RUN: opt < %s -passes=loop-vectorize -S \| FileCheck %s
		; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S \| FileCheck %s

	static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
	/// Check if any recipe of \p Plan will generate a vector value, which will be assigned a vector register.
	static bool willGenerateVectorRegisters(VPlan &Plan, ElementCount VF,

	VPValuesToCheck.push_back(WidenStore->getOperand(1));
	VPValuesToCheck.push_back(WidenStore->getStoredValue());

		// result. Note that this includes VPInstruction, where some opcodes may
		// produce a vector to preserve existing behavior originally as

		// Set of types known to not generate vector values.
		DenseSet<Type *> WillNotWiden;

	if (!WillNotWiden.insert({ScalarTy}).second)
	if (!Visited.insert({ScalarTy}).second)

[LV] Move check if any vector insts will be generated to VPlan. #96622

[LV] Move check if any vector insts will be generated to VPlan. #96622

Conversation

fhahn commented Jun 25, 2024

llvmbot commented Jun 25, 2024 • edited Loading

fhahn commented Jul 1, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

fhahn left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

fhahn left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ayalz left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

llvm-ci commented Jul 7, 2024

fhahn commented Jul 10, 2024

llvmbot commented Jun 25, 2024 •

edited

Loading