From 16d48f16ec520a07a21fcc35a41f3781874ab40f Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 4 May 2024 20:53:53 +0100
Subject: [PATCH] [LV,LAA] Don't vectorize loops with load and store to invar address.

Code checking stores to invariant addresses and reductions made an
incorrect assumption that the case of both a load & store to the same
invariant address does not need to be handled.

In some cases when vectorizing with runtime checks, there may be
dependences with a load and store to the same address, storing a
reduction value.

Update LAA to separately track whether there was a store-store or a
load-store dependence with an invariant address. Bail out early if
there was a load-store dependence with an invariant address. If there
was a store-store dependence, still apply the logic checking whether
all such stores store a reduction.

(cherry picked from commit b54a78d69be1952884462cb897abb9cf60a33978)
---
 .../llvm/Analysis/LoopAccessAnalysis.h        | 28 +++++++++++----
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 14 +++++---
 .../Vectorize/LoopVectorizationLegality.cpp   | 16 ++++++---
 .../reduction-with-invariant-store.ll         | 34 +++++++++++++++++++
 4 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index e39c371b41ec5c..1d67a71f43edde 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -579,7 +579,11 @@ class LoopAccessInfo {
                  AAResults *AA, DominatorTree *DT, LoopInfo *LI);
 
   /// Return true we can analyze the memory accesses in the loop and there are
-  /// no memory dependence cycles.
+  /// no memory dependence cycles. Note that for dependences between loads &
+  /// stores with uniform addresses,
+  /// hasStoreStoreDependenceInvolvingLoopInvariantAddress and
+  /// hasLoadStoreDependenceInvolvingLoopInvariantAddress also need to be
+  /// checked.
   bool canVectorizeMemory() const { return CanVecMem; }
 
   /// Return true if there is a convergent operation in the loop. There may
@@ -632,10 +636,16 @@ class LoopAccessInfo {
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// If the loop has memory dependence involving an invariant address, i.e. two
-  /// stores or a store and a load, then return true, else return false.
-  bool hasDependenceInvolvingLoopInvariantAddress() const {
-    return HasDependenceInvolvingLoopInvariantAddress;
+  /// Return true if the loop has memory dependence involving two stores to an
+  /// invariant address, else return false.
+  bool hasStoreStoreDependenceInvolvingLoopInvariantAddress() const {
+    return HasStoreStoreDependenceInvolvingLoopInvariantAddress;
+  }
+
+  /// Return true if the loop has memory dependence involving a load and a store
+  /// to an invariant address, else return false.
+  bool hasLoadStoreDependenceInvolvingLoopInvariantAddress() const {
+    return HasLoadStoreDependenceInvolvingLoopInvariantAddress;
   }
 
   /// Return the list of stores to invariant addresses.
@@ -697,8 +707,12 @@ class LoopAccessInfo {
   bool CanVecMem = false;
   bool HasConvergentOp = false;
 
-  /// Indicator that there are non vectorizable stores to a uniform address.
-  bool HasDependenceInvolvingLoopInvariantAddress = false;
+  /// Indicator that there are two non vectorizable stores to the same uniform
+  /// address.
+  bool HasStoreStoreDependenceInvolvingLoopInvariantAddress = false;
+  /// Indicator that there is a non vectorizable load and store to the same
+  /// uniform address.
+  bool HasLoadStoreDependenceInvolvingLoopInvariantAddress = false;
 
   /// List of stores to invariant addresses.
   SmallVector<StoreInst *> StoresToInvariantAddresses;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index dd6b88fee415a7..fc9e82015e44f2 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2465,7 +2465,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
     if (isInvariant(Ptr)) {
       // Record store instructions to loop invariant addresses
       StoresToInvariantAddresses.push_back(ST);
-      HasDependenceInvolvingLoopInvariantAddress |=
+      HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
           !UniformStores.insert(Ptr).second;
     }
 
@@ -2521,7 +2521,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
     if (UniformStores.count(Ptr)) {
       LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
                            "load and uniform store to the same address!\n");
-      HasDependenceInvolvingLoopInvariantAddress = true;
+      HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
     }
 
     MemoryLocation Loc = MemoryLocation::get(LD);
@@ -2985,9 +2985,13 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Non vectorizable stores to invariant address were "
-                   << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
-                   << "found in loop.\n";
+  OS.indent(Depth)
+      << "Non vectorizable stores to invariant address were "
+      << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
+                  HasLoadStoreDependenceInvolvingLoopInvariantAddress
+              ? ""
+              : "not ")
+      << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
   PSE->getPredicate().print(OS, Depth);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 37a356c43e29a4..cfd0c1b5e592dc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1067,6 +1067,15 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   if (!LAI->canVectorizeMemory())
     return false;
 
+  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
+    reportVectorizationFailure("We don't allow storing to uniform addresses",
+                               "write to a loop invariant address could not "
+                               "be vectorized",
+                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
+                               TheLoop);
+    return false;
+  }
+
   // We can vectorize stores to invariant address when final reduction value is
   // guaranteed to be stored at the end of the loop. Also, if decision to
   // vectorize loop is made, runtime checks are added so as to make sure that
@@ -1102,13 +1111,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     }
   }
 
-  if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+  if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
     // For each invariant address, check its last stored value is the result
     // of one of our reductions.
     //
-    // We do not check if dependence with loads exists because they are
-    // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this
-    // behaviour changes we have to modify this code.
+    // We do not check if dependence with loads exists because that is already
+    // checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
     ScalarEvolution *SE = PSE.getSE();
     SmallVector<StoreInst *, 4> UnhandledStores;
     for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index 5584aa969367ac..d5074f6a3b0827 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -118,6 +118,40 @@ exit:
   ret void
 }
 
+; Check that if we have a read from an invariant address, we do not vectorize,
+; even if we vectorize with runtime checks. The test below is a variant of
+; @reduc_store_load with a non-constant dependence distance, resulting in
+; vectorization with runtime checks.
+;
+; CHECK-LABEL: @reduc_store_load_with_non_constant_distance_dependence
+; CHECK-NOT: vector.body:
+define void @reduc_store_load_with_non_constant_distance_dependence(ptr %dst, ptr noalias %dst.2, i64 %off) {
+entry:
+  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
+  %dst.2.off = getelementptr inbounds i32, ptr %dst.2, i64 %off
+  store i32 0, ptr %gep.dst, align 4
+  br label %for.body
+
+for.body:
+  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.src = getelementptr inbounds i32, ptr %dst.2, i64 %iv
+  %0 = load i32, ptr %gep.src, align 4
+  %iv.off = mul i64 %iv, 2
+  %add = add nsw i32 %sum, %0
+  %lv = load i32, ptr %gep.dst
+  store i32 %add, ptr %gep.dst, align 4
+  %gep.src.2 = getelementptr inbounds i32, ptr %dst.2.off, i64 %iv
+  store i32 %lv, ptr %gep.src.2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+
 ; Final value is not guaranteed to be stored in an invariant address.
 ; We don't vectorize in that case.
 ;
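A rough C++ sketch (not part of the patch) of the source-level pattern the new
IR test covers; the function and variable names are illustrative only. The loop
keeps a running reduction, re-reads the invariant address dst[42] in every
iteration, and stores the loaded value at a non-constant offset that would
otherwise be handled with runtime checks. With this change, LAA records a
load-store dependence on the invariant address and LoopVectorize bails out
instead of vectorizing the loop with runtime checks.

  #include <cstdint>

  // Mirrors @reduc_store_load_with_non_constant_distance_dependence: dst[42]
  // is loop invariant and is both loaded and stored in every iteration, while
  // the store to src[off + i] has a non-constant dependence distance.
  void reduc_store_load_sketch(int32_t *dst, int32_t *src, int64_t off) {
    int32_t sum = 0;
    dst[42] = 0;
    for (int64_t i = 0; i < 1000; ++i) {
      int32_t prev = dst[42]; // load from the loop-invariant address
      sum += src[i];          // reduction
      dst[42] = sum;          // store the running reduction to the same address
      src[off + i] = prev;    // store with a non-constant dependence distance
    }
  }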