From 35b9da26b0f4547d2515a88bfcd02c15d15f2cea Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Wed, 24 Apr 2024 11:26:23 -0700 Subject: [PATCH] [MachineOutliner][CGData] Global Outlining This commit introduces support for outlining functions across modules using codegen data generated from previous codegen. The codegen data currently manages the outlined hash tree, which records outlining instances that occurred locally in the past. The machine outliner now operates in one of three modes: 1. CGDataMode::None: This is the default outliner mode that uses the suffix tree to identify (local) outlining candidates within a module. This mode is also used by (full)LTO to maintain optimal behavior with the combined module. 2. CGDataMode::Write (`codegen-data-generate`): This mode is identical to the default mode, but it also publishes the stable hash sequences of instructions in the outlined functions into a local outlined hash tree. It then encodes this into the `__llvm_outline` section, which will be dead-stripped at link time. 3. CGDataMode::Read (`codegen-data-use-path={.cgdata}`): This mode reads a codegen data file (.cgdata) and initializes a global outlined hash tree. This tree is used to generate global outlining candidates. Note that the codegen data file has been post-processed with the raw `__llvm_outline` sections from all native objects using the `llvm-cgdata` tool (or a linker, `LLD`, or a new ThinLTO pipeline later). 
--- llvm/include/llvm/CodeGen/MachineOutliner.h | 39 +++ llvm/lib/CodeGen/CMakeLists.txt | 1 + llvm/lib/CodeGen/MachineOutliner.cpp | 242 +++++++++++++++++- llvm/lib/CodeGen/MachineStableHash.cpp | 14 +- llvm/lib/CodeGenData/CodeGenData.cpp | 26 +- llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 + .../CodeGen/AArch64/cgdata-global-hash.ll | 40 +++ .../AArch64/cgdata-read-double-outline.ll | 57 +++++ .../AArch64/cgdata-read-lto-outline.ll | 94 +++++++ .../CodeGen/AArch64/cgdata-read-priority.ll | 68 +++++ .../AArch64/cgdata-read-single-outline.ll | 42 +++ .../CodeGen/AArch64/cgdata-write-outline.ll | 51 ++++ llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 + 13 files changed, 671 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/cgdata-global-hash.ll create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-priority.ll create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll create mode 100644 llvm/test/CodeGen/AArch64/cgdata-write-outline.ll diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h index 84937a8b563ac0..5a8bae744ed9ab 100644 --- a/llvm/include/llvm/CodeGen/MachineOutliner.h +++ b/llvm/include/llvm/CodeGen/MachineOutliner.h @@ -18,6 +18,7 @@ #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineStableHash.h" #include namespace llvm { @@ -233,6 +234,9 @@ struct OutlinedFunction { /// Target-defined identifier for constructing a frame for this function. unsigned FrameConstructionID = 0; + /// The sequence of stable_hash'es of instructions. + std::vector OutlinedHashSequence; + /// Return the number of candidates for this \p OutlinedFunction. 
virtual unsigned getOccurrenceCount() const { return Candidates.size(); } @@ -274,6 +278,41 @@ struct OutlinedFunction { OutlinedFunction() = delete; virtual ~OutlinedFunction() = default; }; + +/// The information necessary to create an outlined function that is matched +/// globally. +struct GlobalOutlinedFunction : public OutlinedFunction { + GlobalOutlinedFunction(OutlinedFunction &OF, unsigned GlobalOccurrenceCount) + : OutlinedFunction(OF.Candidates, OF.SequenceSize, OF.FrameOverhead, + OF.FrameConstructionID), + GlobalOccurrenceCount(GlobalOccurrenceCount) {} + + unsigned GlobalOccurrenceCount; + + /// Return the number of times this sequence appears globally. + /// Global outlining candidate is uniquely created per each match, but this + /// might be erased out when it's overlapped with the previous outlining + /// instance. + unsigned getOccurrenceCount() const override { + assert(Candidates.size() <= 1); + return Candidates.empty() ? 0 : GlobalOccurrenceCount; + } + + /// Return the outlining cost using the global occurrence count + /// with the same cost as the first (unique) candidate. + unsigned getOutliningCost() const override { + assert(Candidates.size() <= 1); + unsigned CallOverhead = + Candidates.empty() + ? 
0 + : Candidates[0].getCallOverhead() * getOccurrenceCount(); + return CallOverhead + SequenceSize + FrameOverhead; + } + + GlobalOutlinedFunction() = delete; + ~GlobalOutlinedFunction() = default; +}; + } // namespace outliner } // namespace llvm diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 2c24de60edd43e..145442f0ce3e2c 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -266,6 +266,7 @@ add_llvm_component_library(LLVMCodeGen Analysis BitReader BitWriter + CodeGenData CodeGenTypes Core MC diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 68a71b80123081..0a8e2850ca9f43 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -59,6 +59,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -66,6 +67,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGenData/CodeGenDataReader.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Mangler.h" @@ -74,6 +76,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/SuffixTree.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include #include #include @@ -121,6 +124,12 @@ static cl::opt OutlinerBenefitThreshold( cl::desc( "The minimum size in bytes before an outlining candidate is accepted")); +static cl::opt + DisableGlobalOutlining("disable-global-outlining", cl::Hidden, + cl::desc("Disable global outlining only by ignoring " + "the codegen data generation or use"), + cl::init(false)); + namespace { /// Maps \p MachineInstrs to unsigned integers and stores the mappings. 
@@ -411,11 +420,32 @@ struct MachineOutliner : public ModulePass { /// Set when the pass is constructed in TargetPassConfig. bool RunOnAllFunctions = true; + /// This is a compact representation of hash sequences of outlined functions. + /// It is used when OutlinerMode = CGDataMode::Write. + /// The resulting hash tree will be emitted into __llvm_outline section + /// which will be dead-stripped not going to the final binary. + /// A post-process using llvm-cgdata, lld, or ThinLTO can merge them into + /// a global outlined hash tree for the subsequent codegen. + std::unique_ptr LocalHashTree; + + /// The combined index to check an LTO mode. + const ModuleSummaryIndex *TheIndex = nullptr; + + /// The mode of the outliner. + /// When it's CGDataMode::None, candidates are populated with the suffix tree + /// within a module and outlined. + /// When it's CGDataMode::Write, in addition to CGDataMode::None, the hash + /// sequences of outlined functions are published into LocalHashTree. + /// When it's CGDataMode::Read, candidates are populated with the global + /// outlined hash tree that has been built by the previous codegen. + CGDataMode OutlinerMode = CGDataMode::None; + StringRef getPassName() const override { return "Machine Outliner"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved(); + AU.addRequired(); AU.setPreservesAll(); ModulePass::getAnalysisUsage(AU); } @@ -450,6 +480,16 @@ struct MachineOutliner : public ModulePass { findCandidates(InstructionMapper &Mapper, std::vector> &FunctionList); + /// Find all repeated substrings that match in the global outlined hash + /// tree built from the previous codegen. + /// + /// \param Mapper Contains outlining mapping information. + /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions + /// each type of candidate. 
+ void findGlobalCandidates( + InstructionMapper &Mapper, + std::vector> &FunctionList); + /// Replace the sequences of instructions represented by \p OutlinedFunctions /// with calls to functions. /// @@ -466,6 +506,12 @@ struct MachineOutliner : public ModulePass { InstructionMapper &Mapper, unsigned Name); + /// Initialize the outliner mode. + void initializeOutlinerMode(const Module &M); + + /// Emit the outlined hash tree into __llvm_outline section. + void emitOutlinedHashTree(Module &M); + /// Calls 'doOutline()' 1 + OutlinerReruns times. bool runOnModule(Module &M) override; @@ -576,6 +622,134 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) { MORE.emit(R); } +struct MatchedEntry { + size_t StartIdx; + size_t Length; + size_t Count; +}; + +static const HashNode *followHashNode(stable_hash StableHash, + const HashNode *Current) { + auto I = Current->Successors.find(StableHash); + return (I == Current->Successors.end()) ? nullptr : I->second.get(); +} + +static std::vector getMatchedEntries(InstructionMapper &Mapper) { + + auto &InstrList = Mapper.InstrList; + auto &UnsignedVec = Mapper.UnsignedVec; + + std::vector MatchedEntries; + std::vector Sequence; + auto Size = UnsignedVec.size(); + + // Get the global outlined hash tree built from the previous run. + assert(cgdata::hasOutlinedHashTree()); + const auto *RootNode = cgdata::getOutlinedHashTree()->getRoot(); + + // Find all matches in the global outlined hash tree. + // It's quadratic complexity in theory, but it's nearly linear in practice + // since the length of outlined candidates are small within a block. 
+ for (size_t I = 0; I < Size; I++) { + if (UnsignedVec[I] >= Size) + continue; + + const MachineInstr &MI = *InstrList[I]; + stable_hash StableHashI = stableHashValue(MI); + if (!StableHashI) + continue; + + Sequence.clear(); + Sequence.push_back(StableHashI); + + const HashNode *LastNode = followHashNode(StableHashI, RootNode); + if (!LastNode) + continue; + + size_t J = I + 1; + for (; J < Size; J++) { + // Break on invalid code + if (UnsignedVec[J] >= Size) + break; + + const MachineInstr &MJ = *InstrList[J]; + stable_hash StableHashJ = stableHashValue(MJ); + // Break on invalid stable hash + if (!StableHashJ) + break; + + LastNode = followHashNode(StableHashJ, LastNode); + if (!LastNode) + break; + + // Even with a match ending with a terminal, we continue finding + // matches to populate all candidates. + Sequence.push_back(StableHashJ); + size_t Count = LastNode->Terminals; + if (Count) + MatchedEntries.push_back({I, J - I + 1, Count}); + } + } + + return MatchedEntries; +} + +static std::vector +stableHashMachineInstrs(const MachineBasicBlock::iterator &Begin, + const MachineBasicBlock::iterator &End) { + std::vector Sequence; + for (auto I = Begin; I != End; I++) { + const MachineInstr &MI = *I; + if (MI.isDebugInstr()) + continue; + stable_hash Hash = stableHashValue(MI); + // if (!Hash) + // continue; + Sequence.push_back(Hash); + } + return Sequence; +} + +// Save hash sequence of candidates for global function outlining. 
+static void +saveHashSequence(std::vector> &FunctionList) { + for (auto &OF : FunctionList) { + auto &C = OF->Candidates.front(); + OF->OutlinedHashSequence = stableHashMachineInstrs(C.begin(), C.end()); + } +} + +void MachineOutliner::findGlobalCandidates( + InstructionMapper &Mapper, + std::vector> &FunctionList) { + FunctionList.clear(); + auto &InstrList = Mapper.InstrList; + auto &MBBFlagsMap = Mapper.MBBFlagsMap; + + std::vector CandidatesForRepeatedSeq; + for (auto &ME : getMatchedEntries(Mapper)) { + CandidatesForRepeatedSeq.clear(); + MachineBasicBlock::iterator StartIt = InstrList[ME.StartIdx]; + MachineBasicBlock::iterator EndIt = InstrList[ME.StartIdx + ME.Length - 1]; + MachineBasicBlock *MBB = StartIt->getParent(); + Candidate C(ME.StartIdx, ME.Length, StartIt, EndIt, MBB, + FunctionList.size(), MBBFlagsMap[MBB]); + CandidatesForRepeatedSeq.push_back(C); + const TargetInstrInfo *TII = C.getMF()->getSubtarget().getInstrInfo(); + std::optional OF = + TII->getOutliningCandidateInfo(CandidatesForRepeatedSeq, /*MinRep*/ 1); + if (!OF || OF->Candidates.empty()) + continue; + // We create a global candidate each match. + assert(OF->Candidates.size() == 1); + + FunctionList.push_back( + std::make_unique(*OF, ME.Count)); + } + assert(OutlinerMode == CGDataMode::Read); + saveHashSequence(FunctionList); +} + void MachineOutliner::findCandidates( InstructionMapper &Mapper, std::vector> &FunctionList) { @@ -680,6 +854,9 @@ void MachineOutliner::findCandidates( FunctionList.push_back(std::make_unique(*OF)); } + assert(OutlinerMode != CGDataMode::Read); + if (OutlinerMode == CGDataMode::Write) + saveHashSequence(FunctionList); } MachineFunction *MachineOutliner::createOutlinedFunction( @@ -977,6 +1154,10 @@ bool MachineOutliner::outline( // Statistics. 
NumOutlined++; } + if (OutlinerMode == CGDataMode::Write) { + unsigned Count = OF->Candidates.size(); + LocalHashTree->insert({OF->OutlinedHashSequence, Count}); + } } LLVM_DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";); @@ -1124,12 +1305,65 @@ void MachineOutliner::emitInstrCountChangedRemark( } } +void MachineOutliner::initializeOutlinerMode(const Module &M) { + if (DisableGlobalOutlining) + return; + + if (auto *IndexWrapperPass = + getAnalysisIfAvailable()) + TheIndex = IndexWrapperPass->getIndex(); + + // (Full)LTO module does not have functions added to the index. + // In this case, we run the outliner without using codegen data as usual. + if (TheIndex && !TheIndex->hasExportedFunctions(M)) + return; + + // When codegen data write is enabled, we want to write the local outlined + // hash tree to the custom section, `__llvm_outline`. + // When the outlined hash tree is available from the previous codegen data, + // we want to read it to optimistically create global outlining candidates. + if (cgdata::emitCGData()) { + OutlinerMode = CGDataMode::Write; + // Create a local outlined hash tree to be published. + LocalHashTree.reset(new OutlinedHashTree()); + // We don't need to read the outlined hash tree from the previous codegen + } else if (cgdata::hasOutlinedHashTree()) + OutlinerMode = CGDataMode::Read; +} + +void MachineOutliner::emitOutlinedHashTree(Module &M) { + assert(LocalHashTree); + if (!LocalHashTree->empty()) { + LLVM_DEBUG({ + dbgs() << "Emit outlined hash tree. 
Size: " << LocalHashTree->size() + << "\n"; + }); + SmallVector Buf; + raw_svector_ostream OS(Buf); + + OutlinedHashTreeRecord HTR(std::move(LocalHashTree)); + HTR.serialize(OS); + + llvm::StringRef Data(Buf.data(), Buf.size()); + std::unique_ptr Buffer = + MemoryBuffer::getMemBuffer(Data, "in-memory outlined hash tree", false); + + Triple TT(M.getTargetTriple()); + embedBufferInModule( + M, *Buffer.get(), + getCodeGenDataSectionName(CG_outline, TT.getObjectFormat())); + } +} + bool MachineOutliner::runOnModule(Module &M) { // Check if there's anything in the module. If it's empty, then there's // nothing to outline. if (M.empty()) return false; + // Initialize the outliner mode. + initializeOutlinerMode(M); + // Number to append to the current outlined function. unsigned OutlinedFunctionNum = 0; @@ -1149,6 +1383,9 @@ bool MachineOutliner::runOnModule(Module &M) { } } + if (OutlinerMode == CGDataMode::Write) + emitOutlinedHashTree(M); + return true; } @@ -1179,7 +1416,10 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) { std::vector> FunctionList; // Find all of the outlining candidates. - findCandidates(Mapper, FunctionList); + if (OutlinerMode == CGDataMode::Read) + findGlobalCandidates(Mapper, FunctionList); + else + findCandidates(Mapper, FunctionList); // If we've requested size remarks, then collect the MI counts of every // function before outlining, and the MI counts after outlining. 
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index 5abfbd5981fba8..33906a3374812b 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -94,9 +94,17 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { case MachineOperand::MO_Metadata: StableHashBailingMetadataUnsupported++; return 0; - case MachineOperand::MO_GlobalAddress: - StableHashBailingGlobalAddress++; - return 0; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + if (GV->hasPrivateLinkage() || !GV->hasName()) { + StableHashBailingGlobalAddress++; + return 0; + } + auto Name = GV->getName(); + return stable_hash_combine(MO.getType(), MO.getTargetFlags(), + stable_hash_combine_string(Name), + MO.getOffset()); + } case MachineOperand::MO_TargetIndex: { if (const char *Name = MO.getTargetIndexName()) return stable_hash_combine(MO.getType(), MO.getTargetFlags(), diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CodeGenData/CodeGenData.cpp index 3bd21c97c7de7a..841af8a347eeb2 100644 --- a/llvm/lib/CodeGenData/CodeGenData.cpp +++ b/llvm/lib/CodeGenData/CodeGenData.cpp @@ -24,6 +24,13 @@ using namespace llvm; using namespace cgdata; +cl::opt + CodeGenDataGenerate("codegen-data-generate", cl::init(false), cl::Hidden, + cl::desc("Emit CodeGen Data into custom sections")); +cl::opt + CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, + cl::desc("File path to where .cgdata file is read")); + static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { std::string Msg; @@ -133,7 +140,24 @@ CodeGenData &CodeGenData::getInstance() { auto *CGD = new CodeGenData(); Instance.reset(CGD); - // TODO: Initialize writer or reader mode for the client optimization. + if (CodeGenDataGenerate) + CGD->EmitCGData = true; + else if (!CodeGenDataUsePath.empty()) { + // Initialize the global CGData if the input file name is given. 
+ // We do not error-out when failing to parse the input file. + // Instead, just emit a warning message and fall back as if no CGData + // were available. + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = CodeGenDataReader::create(CodeGenDataUsePath, *FS); + if (Error E = ReaderOrErr.takeError()) { + warn(std::move(E), CodeGenDataUsePath); + return; + } + // Publish each CGData based on the data type in the header. + auto Reader = ReaderOrErr->get(); + if (Reader->hasOutlinedHashTree()) + CGD->publishOutlinedHashTree(Reader->releaseOutlinedHashTree()); + } }); return *(Instance.get()); } diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index d3c8e3b7e805c1..391d63d2ceeaaf 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -16,6 +16,7 @@ ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: Default Regalloc Eviction Advisor ; CHECK-NEXT: Default Regalloc Priority Advisor +; CHECK-NEXT: Module summary info ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll b/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll new file mode 100644 index 00000000000000..09eb639ab40ef5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll @@ -0,0 +1,40 @@ +; This test verifies the stable hash values for different global variables +; that have distinct names. +; We generate two different cgdata files from nearly identical outline instances, +; with the only difference being the last call target globals, @g vs @h. 
+ +; RUN: split-file %s %t + +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-g.ll -o %t/local-g.o +; RUN: llvm-cgdata merge %t/local-g.o -o %t/local-g.cgdata +; RUN: llvm-cgdata dump %t/local-g.cgdata -o %t/local-g.cgtext +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-h.ll -o %t/local-h.o +; RUN: llvm-cgdata merge %t/local-h.o -o %t/local-h.cgdata +; RUN: llvm-cgdata dump %t/local-h.cgdata -o %t/local-h.cgtext + +; We compare the trees which are only different at the terminal node's hash value. +; Here we simply count the different lines that have `Hash` string. +; RUN: not diff %t/local-g.cgtext %t/local-h.cgtext 2>&1 | grep Hash | wc -l | FileCheck %s +; CHECK: 2 + +;--- local-g.ll +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- local-h.ll +declare i32 @h(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @h(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @h(i32 20, i32 1, i32 2); + ret i32 %1 +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll new file mode 100644 index 00000000000000..49f417e9d01294 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll @@ -0,0 +1,57 @@ +; This test demonstrates how identical instruction sequences are handled during global outlining. +; Currently, we do not attempt to share an outlined function for identical sequences. +; Instead, each instruction sequence that matches against the global outlined hash tree +; is outlined into its own unique function. + +; RUN: split-file %s %t + +; First, we generate the cgdata file from a local outline instance present in local-two.ll. 
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write +; RUN: llvm-cgdata merge %t_write -o %t_cgdata +; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW + +; SHOW: Outlined hash tree: +; SHOW-NEXT: Total Node Count: 4 +; SHOW-NEXT: Terminal Node Count: 1 +; SHOW-NEXT: Depth: 3 + +; Now, we read the cgdata for local-two-another.ll and proceed to optimistically outline +; each instruction sequence that matches against the global outlined hash tree. +; Since each matching sequence is considered a candidate, we expect to generate two +; unique outlined functions. These functions, although unique, will be identical in code, +; and thus, will be folded by the linker. + +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-two-another.ll -o %t_read +; RUN: llvm-objdump -d %t_read | FileCheck %s + +; CHECK: _OUTLINED_FUNCTION_{{.*}}: +; CHECK-NEXT: mov +; CHECK-NEXT: mov +; CHECK-NEXT: b + +; CHECK: _OUTLINED_FUNCTION_{{.*}}: +; CHECK-NEXT: mov +; CHECK-NEXT: mov +; CHECK-NEXT: b + +;--- local-two.ll +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- local-two-another.ll +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} +define i32 @f4() minsize { + %1 = call i32 @g(i32 40, i32 1, i32 2); + ret i32 %1 +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll new file mode 100644 index 00000000000000..e7b49f422f118a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll @@ -0,0 +1,94 @@ +; This test is similar to cgdata-read-double-outline.ll, but it is executed with LTO (Link Time Optimization). 
+; It demonstrates how identical instruction sequences are handled during global outlining. +; Currently, we do not attempt to reuse an outlined function for identical sequences. +; Instead, each instruction sequence that appears in the global outlined hash tree +; is outlined into its own unique function. + +; RUN: split-file %s %t + +; We first create the cgdata file from a local outline instance in local-two.ll +; RUN: opt -module-summary %t/local-two.ll -o %t/write.bc +; RUN: llvm-lto2 run %t/write.bc -o %t/write \ +; RUN: -r %t/write.bc,_f1,px -r %t/write.bc,_f2,px -r %t/write.bc,_g,p \ +; RUN: -codegen-data-generate=true +; RUN: llvm-cgdata merge %t/write.1 -o %t_cgdata +; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW + +; SHOW: Outlined hash tree: +; SHOW-NEXT: Total Node Count: 4 +; SHOW-NEXT: Terminal Node Count: 1 +; SHOW-NEXT: Depth: 3 + +; Now, we execute either ThinLTO or LTO by reading the cgdata for local-two-another.ll. +; With ThinLTO, similar to the no-LTO scenario shown in cgdata-read-double-outline.ll, +; it optimistically outlines each instruction sequence that matches against +; the global outlined hash tree. Since each matching sequence is considered a candidate, +; we expect to generate two unique outlined functions that will be folded +; by the linker at a later stage. +; However, with LTO, we do not utilize the cgdata, but instead fall back to the default +; outliner mode. This results in a single outlined function that is +; shared across two call-sites. 
+ +; Run ThinLTO +; RUN: opt -module-summary %t/local-two-another.ll -o %t/thinlto.bc +; RUN: llvm-lto2 run %t/thinlto.bc -o %t/thinlto \ +; RUN: -r %t/thinlto.bc,_f3,px -r %t/thinlto.bc,_f4,px -r %t/thinlto.bc,_g,p \ +; RUN: -codegen-data-use-path=%t_cgdata +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s + +; CHECK: _OUTLINED_FUNCTION_{{.*}}: +; CHECK-NEXT: mov +; CHECK-NEXT: mov +; CHECK-NEXT: b +; CHECK: _OUTLINED_FUNCTION_{{.*}}: +; CHECK-NEXT: mov +; CHECK-NEXT: mov +; CHECK-NEXT: b + +; Run ThinLTO while disabling the global outliner. +; We have a single outlined case with the default outliner. +; RUN: llvm-lto2 run %t/thinlto.bc -o %t/thinlto-disable \ +; RUN: -r %t/thinlto.bc,_f3,px -r %t/thinlto.bc,_f4,px -r %t/thinlto.bc,_g,p \ +; RUN: -codegen-data-use-path=%t_cgdata \ +; RUN: -disable-global-outlining +; RUN: llvm-objdump -d %t/thinlto-disable.1 | FileCheck %s --check-prefix=DISABLE + +; DISABLE: _OUTLINED_FUNCTION_{{.*}}: +; DISABLE-NEXT: mov +; DISABLE-NEXT: mov +; DISABLE-NEXT: b +; DISABLE-NOT: _OUTLINED_FUNCTION_{{.*}}: + +; Run LTO, which effectively disables the global outliner. 
+; RUN: opt %t/local-two-another.ll -o %t/lto.bc +; RUN: llvm-lto2 run %t/lto.bc -o %t/lto \ +; RUN: -r %t/lto.bc,_f3,px -r %t/lto.bc,_f4,px -r %t/lto.bc,_g,p \ +; RUN: -codegen-data-use-path=%t_cgdata +; RUN: llvm-objdump -d %t/lto.0 | FileCheck %s --check-prefix=DISABLE + +;--- local-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- local-two-another.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} +define i32 @f4() minsize { + %1 = call i32 @g(i32 40, i32 1, i32 2); + ret i32 %1 +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll b/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll new file mode 100644 index 00000000000000..642beb46915d73 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll @@ -0,0 +1,68 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; using codegen data that has been read from a previous codegen run. +; When multiple matches occur, we prioritize the candidates using the global frequency. 
+ +; RUN: split-file %s %t + +; First, we generate the cgdata file from local outline instances present in write1.ll and write2.ll +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/write1.ll -o %t_write1 +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/write2.ll -o %t_write2 +; RUN: llvm-cgdata merge %t_write1 %t_write2 -o %t_cgdata +; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW + +; SHOW: Outlined hash tree: +; SHOW-NEXT: Total Node Count: 8 +; SHOW-NEXT: Terminal Node Count: 2 +; SHOW-NEXT: Depth: 4 + +; Now, we read the cgdata in the machine outliner, enabling us to optimistically +; outline a singleton instance in read.ll that matches against the cgdata. +; There are two matches -- (1) (mov #1, mov #2, mov #3, b) and (2) (mov #2, mov #3, b). +; Even though sequence (1) is longer than sequence (2), the latter is outlined because it occurs more frequently in the outlined hash tree. + +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/read.ll -o %t_read +; RUN: llvm-objdump -d %t_read | FileCheck %s + +; CHECK: _OUTLINED_FUNCTION +; CHECK-NEXT: mov +; CHECK-NEXT: mov +; CHECK-NEXT: b + +;--- write1.ll +; The sequence (mov #2, mov #3, b) is repeated 4 times. +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 50, i32 2, i32 3); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 60, i32 2, i32 3); + ret i32 %1 +} +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 70, i32 2, i32 3); + ret i32 %1 +} +define i32 @f4() minsize { + %1 = call i32 @g(i32 40, i32 80, i32 2, i32 3); + ret i32 %1 +} + +;--- write2.ll +; The sequence (mov #1, mov #2, mov #3, b) is repeated 2 times. 
+declare i32 @g(i32, i32, i32) +define i32 @f6() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2, i32 3); + ret i32 %1 +} +define i32 @f7() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2, i32 3); + ret i32 %1 +} + +;--- read.ll +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2, i32 3); + ret i32 %1 +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll new file mode 100644 index 00000000000000..2c606a5a13007e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll @@ -0,0 +1,42 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; using codegen data that has been read from a previous codegen run. + +; RUN: split-file %s %t + +; First, we generate the cgdata file from a local outline instance present in local-two.ll. +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write +; RUN: llvm-cgdata merge %t_write -o %t_cgdata +; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW + +; SHOW: Outlined hash tree: +; SHOW-NEXT: Total Node Count: 4 +; SHOW-NEXT: Terminal Node Count: 1 +; SHOW-NEXT: Depth: 3 + +; Now, we read the cgdata in the machine outliner, enabling us to optimistically +; outline a singleton instance in local-one.ll that matches against the cgdata. 
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one.ll -o %t_read +; RUN: llvm-objdump -d %t_read | FileCheck %s + +; CHECK: _OUTLINED_FUNCTION +; CHECK-NEXT: mov +; CHECK-NEXT: mov +; CHECK-NEXT: b + +;--- local-two.ll +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- local-one.ll +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll new file mode 100644 index 00000000000000..0527ec1434ba09 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll @@ -0,0 +1,51 @@ +; This test verifies whether an outlined function is encoded into the __llvm_outline section +; when the -codegen-data-generate flag is used. + +; Verify whether an outlined function is always created, but only encoded into the section when the flag is used. +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %s -o %t_save +; RUN: llvm-objdump -d %t_save | FileCheck %s +; RUN: llvm-objdump -h %t_save | FileCheck %s --check-prefix=SECTNAME +; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=false -filetype=obj %s -o %t_nosave +; RUN: llvm-objdump -d %t_nosave | FileCheck %s +; RUN: llvm-objdump -h %t_nosave | FileCheck %s --check-prefix=NOSECTNAME + +; CHECK: _OUTLINED_FUNCTION +; CHECK-NEXT: mov +; CHECK-NEXT: mov +; CHECK-NEXT: b +; SECTNAME: __llvm_outline +; NOSECTNAME-NOT: __llvm_outline + +; Verify the content of cgdata after it has been processed with llvm-cgdata. 
+; RUN: llvm-cgdata merge %t_save -o %t_cgdata +; RUN: llvm-cgdata dump %t_cgdata | FileCheck %s --check-prefix=TREE + +; TREE: :outlined_hash_tree +; TREE: --- +; TREE-NEXT: 0: +; TREE-NEXT: Hash: 0x0 +; TREE-NEXT: Terminals: 0 +; TREE-NEXT: SuccessorIds: [ 1 ] +; TREE-NEXT: 1: +; TREE-NEXT: Hash: {{.}} +; TREE-NEXT: Terminals: 0 +; TREE-NEXT: SuccessorIds: [ 2 ] +; TREE-NEXT: 2: +; TREE-NEXT: Hash: {{.}} +; TREE-NEXT: Terminals: 0 +; TREE-NEXT: SuccessorIds: [ 3 ] +; TREE-NEXT: 3: +; TREE-NEXT: Hash: {{.}} +; TREE-NEXT: Terminals: 2 +; TREE-NEXT: SuccessorIds: [ ] +; TREE-NEXT: ... + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 90472f246918f3..f02425d53b3ee9 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -20,6 +20,7 @@ ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: Default Regalloc Eviction Advisor ; CHECK-NEXT: Default Regalloc Priority Advisor +; CHECK-NEXT: Module summary info ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager