[EraVM] Add support for splitting live ranges of PHI nodes in loops

This is useful for loops with large switch statements in which PHI nodes are used frequently and we want to keep these variables in a single register.

Signed-off-by: Vladimir Radosavljevic <[email protected]>
matter-labs · Sep 25, 2024 · 4c59238 · 4c59238
1 parent 72097e2
commit 4c59238
Show file tree

Hide file tree

Showing 3 changed files with 258 additions and 10 deletions.
diff --git a/llvm/lib/Target/EraVM/EraVMPostCodegenPrepare.cpp b/llvm/lib/Target/EraVM/EraVMPostCodegenPrepare.cpp
@@ -19,9 +19,12 @@
 
 #include "EraVM.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -30,6 +33,14 @@ using namespace llvm::PatternMatch;
 #define ERAVM_POST_CODEGEN_PREPARE_NAME                                        \
   "EraVM optimizations after CodeGenPrepare pass"
 
+// Master switch for the loop-PHI live-range splitting: when it is false,
+// splitLoopPHILiveRanges() returns without touching the function.
+static cl::opt<bool> EnableSplitLoopPHILiveRanges(
+    "eravm-enable-split-loop-phi-live-ranges", cl::Hidden, cl::init(true),
+    cl::desc("Enable splitting live ranges of PHI nodes in loops"));
+
+// Use-count threshold: a loop-header PHI is considered for splitting only
+// when its number of uses is strictly greater than this value.
+static cl::opt<unsigned> NumOfPHIUsesToSplitLiveRanges(
+    "eravm-num-of-phi-uses-to-split-live-ranges", cl::Hidden, cl::init(20),
+    cl::desc("Number of uses of PHI node to consider splitting live ranges"));
+
 namespace {
 struct EraVMPostCodegenPrepare : public FunctionPass {
 public:
@@ -44,6 +55,9 @@ struct EraVMPostCodegenPrepare : public FunctionPass {
   }
 
+  // Declare the analyses this pass consumes: the dominator tree and the loop
+  // info are both fetched in runOnFunction().
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    // Only the loop info is marked as preserved here.
+    AU.addPreserved<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -303,7 +317,229 @@ static bool rearrangeOverflowHandlingBranches(Function &F) {
   return Changed;
 }
 
-static bool runImpl(Function &F) {
+struct InstSplitInfo {
+  // An add instruction together with the constant operand it had when it
+  // was recorded, i.e. before any rewrite done by this optimization.
+  using InstInfo = std::pair<Instruction *, APInt>;
+  // All recorded instructions, in the order they were added.
+  SmallVector<InstInfo, 4> Insts;
+  // Position in Insts of the instruction that instructions in dominatee
+  // blocks are re-based on; UINT64_MAX while nothing has been recorded.
+  uint64_t LastSplitIdx = UINT64_MAX;
+
+  // Record I and make it the split instruction unless the current split
+  // instruction comes after it.
+  void addInst(Instruction *I) {
+    assert(isa<ConstantInt>(I->getOperand(1)) && "Expected constant operand");
+    const APInt &Imm = cast<ConstantInt>(I->getOperand(1))->getValue();
+
+    // The first recorded instruction always becomes the split point;
+    // afterwards it is replaced by any instruction that does not precede it.
+    const bool HasSplitInst = LastSplitIdx != UINT64_MAX;
+    if (!HasSplitInst || !I->comesBefore(Insts[LastSplitIdx].first))
+      LastSplitIdx = Insts.size();
+
+    Insts.emplace_back(I, Imm);
+  }
+
+  // Return the entry that dominatee blocks should be re-based on. Must only
+  // be called after at least one addInst().
+  InstInfo &getLastSplitInst() {
+    assert(LastSplitIdx < Insts.size() && "Expected valid index");
+    return Insts[LastSplitIdx];
+  }
+};
+
+// Rewrite the instruction in InstInfo so that it is computed from the
+// instruction in DomInstInfo instead of from their common first operand.
+// Both infos carry the constant the instruction was recorded with, so the
+// new constant does not depend on whether either instruction was already
+// rewritten. Basically, we are doing the following transformation:
+//   %Dominator = add %In, DomImm
+//   %I         = add %In, Imm
+// to
+//   %Dominator = add %In, DomImm
+//   %I         = add %Dominator, Imm - DomImm
+// DT is only used to assert that the dominator instruction really dominates
+// the rewritten one.
+static void updateInstOperands(InstSplitInfo::InstInfo &InstInfo,
+                               InstSplitInfo::InstInfo &DomInstInfo,
+                               const DominatorTree &DT) {
+  // Bind by reference: a by-value binding would copy the pair and its APInt.
+  const auto &[Inst, Imm] = InstInfo;
+  const auto &[DomInst, DomImm] = DomInstInfo;
+  assert(DT.dominates(DomInst, Inst) && "Expected dominator instruction");
+
+  Inst->setOperand(0, DomInst);
+  Inst->setOperand(1, ConstantInt::get(Inst->getType(), Imm - DomImm));
+
+  // TODO: Relax this check, since we don't need to drop poison
+  // flags in all cases.
+  Inst->dropPoisonGeneratingFlags();
+}
+
+// This function splits the live ranges of PHI nodes whose users are add
+// instructions with a constant operand. This is useful for loops with large
+// switch statements where PHI nodes are used frequently, and we want to keep
+// these variables in a single register.
+// In case regalloc is not able to keep these variables in a single register,
+// we will get something like this in all cases of the switch where variable
+// is used:
+//   preheader:
+//     %r1 = def
+//   header:
+//     jump @JTI
+//   ...
+//   bb1:
+//     %r2 = add %r1, 1
+//     bcc bb2
+//   bb2:
+//     %r1 = copy %r2 <- regalloc is not able to keep variable in the same reg
+//     b latch
+//   ...
+//   latch:
+//     bcc header
+//
+// Ideally, we would like to have something like this:
+//   preheader:
+//     %r1 = def
+//   header:
+//     jump @JTI
+//   ...
+//   bb1:
+//     %r1 = add %r1, 1 <- regalloc managed to keep variable in the same reg
+//     bcc bb2
+//   bb2:
+//                      <- no need for copy instruction
+//     b latch
+//   ...
+//   latch:
+//     bcc header
+//
+// To help regalloc to try to preserve frequently used PHI nodes in a single
+// register we are finding add instructions with constant operands that are
+// users of the PHI, and changing first operand of the add instruction to the
+// nearest dominating add instruction while updating the constant operand. This
+// way, regalloc will have a better chance to keep the variable in the same
+// register, since we changed the intervals of the variable.
+// For example, we are transforming this:
+//   header:
+//     %phi = phi
+//     ...
+//   bb1:
+//     %add1 = add %phi, 64
+//     ...
+//     %add2 = add %phi, -64
+//     bcc bb2
+//   bb2:
+//     %add3 = add %phi, -32
+//     ...
+//     %add4 = add %phi, -96
+//     ...
+//     b latch
+//   latch:
+//     bcc header
+//
+// To this (where add3 and add4 are updated with the nearest dominating add,
+// which is add2):
+//   header:
+//     %phi = phi
+//     ...
+//   bb1:
+//     %add1 = add %phi, 64
+//     ...
+//     %add2 = add %phi, -64
+//     bcc bb2
+//   bb2:
+//     %add3 = add %add2, 32
+//     ...
+//     %add4 = add %add2, -32
+//     ...
+//     b latch
+//   latch:
+//     bcc header
+//
+// To do so, we perform the following steps:
+//   1. Find all users of the PHI node in the loop that are add instructions
+//      with constant operands and, for each block, remember the one that is
+//      closest to the end of the block. This instruction will be used to
+//      split ranges in the dominatee blocks.
+//   2. For each block, find the nearest dominator block from which we can
+//      split the live range.
+//   3. Update the instructions in the block by changing their first operand
+//      to the nearest dominator's instruction and adjusting the constant
+//      operands accordingly.
+static bool splitPHILiveRange(PHINode &Phi, const LoopInfo &LI, const Loop &L,
+                              const DominatorTree &DT) {
+  assert(Phi.getParent() == L.getHeader() &&
+         "Expected PHI node in a loop header");
+  DomTreeNode *LoopHeaderNode = DT.getNode(L.getHeader());
+  if (!LoopHeaderNode)
+    return false;
+
+  // Group the candidate add instructions by their parent block.
+  DenseMap<BasicBlock *, InstSplitInfo> Splits;
+  for (auto *U : Phi.users()) {
+    // Only collect add instructions with constant operands whose parent block
+    // LoopInfo maps to this very loop. dyn_cast is used so that the null
+    // check below is meaningful; cast<> never returns null.
+    auto *UI = dyn_cast<Instruction>(U);
+    if (!UI || UI->getOpcode() != Instruction::Add ||
+        !isa<ConstantInt>(UI->getOperand(1)) ||
+        LI.getLoopFor(UI->getParent()) != &L)
+      continue;
+    Splits[UI->getParent()].addInst(UI);
+  }
+
+  // If there are not at least two blocks, we can't split the live range,
+  // since we need dominator and dominatee blocks to do so.
+  if (Splits.size() < 2)
+    return false;
+
+  // Split ranges across blocks. This is done by finding the nearest
+  // dominator block, from which we can split the live range.
+  bool Changed = false;
+  for (auto &[BB, Infos] : Splits) {
+    DomTreeNode *Node = DT.getNode(BB);
+    if (!Node)
+      continue;
+
+    // Find the nearest dominator, from which we can split the live range.
+    // The walk goes up the dominator tree one immediate dominator at a time.
+    InstSplitInfo::InstInfo *NearestDominator = nullptr;
+    while ((Node = Node->getIDom())) {
+      auto I = Splits.find(Node->getBlock());
+      if (I != Splits.end()) {
+        // We found the nearest dominator block, so take the last
+        // instruction from it.
+        NearestDominator = &I->second.getLastSplitInst();
+        break;
+      }
+
+      // Bail out if we reached the start of the loop.
+      if (Node == LoopHeaderNode)
+        break;
+    }
+
+    // If we didn't find any dominator, skip this BB.
+    if (!NearestDominator)
+      continue;
+
+    // Update instructions in the block with the nearest dominator. The
+    // stored constants are the ones recorded before any rewrite, so the
+    // order in which blocks are visited does not affect the new constants.
+    for (auto &Info : Infos.Insts)
+      updateInstOperands(Info, *NearestDominator, DT);
+
+    // The dominator instruction now feeds the rewritten instructions.
+    // TODO: Relax this check, since we don't need to drop poison
+    // flags in all cases.
+    NearestDominator->first->dropPoisonGeneratingFlags();
+    Changed = true;
+  }
+  return Changed;
+}
+
+// This optimization tries to split live ranges of PHI nodes in a loop,
+// with a large number of users.
+static bool splitLoopPHILiveRanges(Function &F, LoopInfo &LI,
+                                   DominatorTree &DT) {
+  bool Changed = false;
+  // Nothing to do when the optimization is switched off on the command line.
+  if (EnableSplitLoopPHILiveRanges) {
+    // NOTE(review): iterating LoopInfo directly appears to visit top-level
+    // loops only - confirm that skipping nested loops is intended.
+    for (Loop *CurLoop : LI)
+      for (PHINode &HeaderPhi : CurLoop->getHeader()->phis())
+        // Only PHIs with more uses than the threshold are worth splitting.
+        if (HeaderPhi.getNumUses() > NumOfPHIUsesToSplitLiveRanges)
+          Changed |= splitPHILiveRange(HeaderPhi, LI, *CurLoop, DT);
+  }
+  return Changed;
+}
+
+static bool runImpl(Function &F, LoopInfo &LI, DominatorTree &DT) {
   bool Changed = false;
   for (auto &BB : F) {
     for (auto &I : llvm::make_early_inc_range(BB)) {
@@ -320,28 +556,38 @@ static bool runImpl(Function &F) {
     }
   }
 
+  Changed |= splitLoopPHILiveRanges(F, LI, DT);
   Changed |= rearrangeOverflowHandlingBranches(F);
   return Changed;
 }
 
+// Legacy pass manager entry point: returns whatever runImpl() reports, or
+// false when skipFunction() says the function must be skipped.
 bool EraVMPostCodegenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
-  return runImpl(F);
+
+  // Fetch the analyses declared in getAnalysisUsage() and hand them to the
+  // implementation shared with the new pass manager entry point.
+  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  return runImpl(F, LI, DT);
 }
 
 char EraVMPostCodegenPrepare::ID = 0;
 
-INITIALIZE_PASS(EraVMPostCodegenPrepare, DEBUG_TYPE,
-                ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
+// Register the pass together with the two analysis passes it requires.
+INITIALIZE_PASS_BEGIN(EraVMPostCodegenPrepare, DEBUG_TYPE,
+                      ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(EraVMPostCodegenPrepare, DEBUG_TYPE,
+                    ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
 
+// Creates a new instance of the legacy FunctionPass defined in this file.
 FunctionPass *llvm::createEraVMPostCodegenPreparePass() {
   return new EraVMPostCodegenPrepare();
 }
 
+// New pass manager entry point: obtains loop info and the dominator tree
+// from the analysis manager and runs the shared implementation. Reports no
+// analyses as preserved when anything changed, and all of them otherwise.
 PreservedAnalyses
 EraVMPostCodegenPreparePass::run(Function &F, FunctionAnalysisManager &AM) {
-  if (runImpl(F))
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  if (runImpl(F, LI, DT))
     return PreservedAnalyses::none();
   return PreservedAnalyses::all();
 }
diff --git a/llvm/test/CodeGen/EraVM/O3-pipeline.ll b/llvm/test/CodeGen/EraVM/O3-pipeline.ll
@@ -90,6 +90,8 @@ target triple = "eravm"
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       TLS Variable Hoist
 ; CHECK-NEXT:       CodeGen Prepare
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       EraVM optimizations after CodeGenPrepare pass
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass

diff --git a/llvm/test/CodeGen/EraVM/split-loop-phi-live-ranges.ll b/llvm/test/CodeGen/EraVM/split-loop-phi-live-ranges.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -passes=eravm-post-codegen-prepare -S < %s | FileCheck %s
+; RUN: opt -passes=eravm-post-codegen-prepare -eravm-num-of-phi-uses-to-split-live-ranges=2 -S < %s | FileCheck %s
 
 target datalayout = "E-p:256:256-i256:256:256-S32-a:256:256"
 target triple = "eravm"
@@ -29,19 +29,19 @@ define i256 @test() {
 ; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i256 [[RESULT]], 64
 ; CHECK-NEXT:    [[INTTOPTR0:%.*]] = inttoptr i256 [[ADD2]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR0]], align 1
-; CHECK-NEXT:    [[ADD3:%.*]] = add nuw nsw i256 [[RESULT]], -64
+; CHECK-NEXT:    [[ADD3:%.*]] = add i256 [[RESULT]], -64
 ; CHECK-NEXT:    [[INTTOPTR1:%.*]] = inttoptr i256 [[ADD3]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR1]], align 1
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i256 [[RESULT]], 1000
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[CASE2_BB1:%.*]], label [[EXIT]]
 ; CHECK:       case2_bb1:
-; CHECK-NEXT:    [[ADD4:%.*]] = add nuw nsw i256 [[RESULT]], -32
+; CHECK-NEXT:    [[ADD4:%.*]] = add i256 [[ADD3]], 32
 ; CHECK-NEXT:    [[INTTOPTR2:%.*]] = inttoptr i256 [[ADD4]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR2]], align 1
-; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i256 [[RESULT]], -96
+; CHECK-NEXT:    [[ADD5:%.*]] = add i256 [[ADD3]], -32
 ; CHECK-NEXT:    [[INTTOPTR3:%.*]] = inttoptr i256 [[ADD5]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR3]], align 1
-; CHECK-NEXT:    [[ADD6:%.*]] = add nuw nsw i256 [[RESULT]], 128
+; CHECK-NEXT:    [[ADD6:%.*]] = add i256 [[ADD3]], 192
 ; CHECK-NEXT:    [[INTTOPTR4:%.*]] = inttoptr i256 [[ADD6]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR4]], align 1
 ; CHECK-NEXT:    br label [[INCREMENT]]