Skip to content

Commit

Permalink
[MemProf] Track and report profiled sizes through cloning (#98382)
Browse files Browse the repository at this point in the history
If requested, via the -memprof-report-hinted-sizes option, track the
total profiled size of each MIB through the thin link, then report on
the corresponding allocation coldness after all cloning is complete.

To save size, the per-MIB total sizes are appended to the existing
allocation info bitcode records only when the option is specified, and
the sizes are kept separate from the MIBs in the index.
  • Loading branch information
teresajohnson authored Jul 11, 2024
1 parent 1cafde2 commit 9f8205d
Show file tree
Hide file tree
Showing 10 changed files with 161 additions and 34 deletions.
5 changes: 3 additions & 2 deletions llvm/include/llvm/Bitcode/LLVMBitCodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,8 @@ enum GlobalValueSummarySymtabCodes {
// [valueid, n x stackidindex]
FS_PERMODULE_CALLSITE_INFO = 26,
// Summary of per-module allocation memprof metadata.
// [n x (alloc type, nummib, nummib x stackidindex)]
// [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex),
// [nummib x total size]?]
FS_PERMODULE_ALLOC_INFO = 27,
// Summary of combined index memprof callsite metadata.
// [valueid, numstackindices, numver,
Expand All @@ -316,7 +317,7 @@ enum GlobalValueSummarySymtabCodes {
// Summary of combined index allocation memprof metadata.
// [nummib, numver,
// nummib x (alloc type, numstackids, numstackids x stackidindex),
// numver x version]
// numver x version, [nummib x total size]?]
FS_COMBINED_ALLOC_INFO = 29,
FS_STACK_IDS = 30,
};
Expand Down
16 changes: 15 additions & 1 deletion llvm/include/llvm/IR/ModuleSummaryIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,10 @@ struct AllocInfo {
// Vector of MIBs in this memprof metadata.
std::vector<MIBInfo> MIBs;

// If requested, keep track of total profiled sizes for each MIB. This will be
// a vector of the same length and order as the MIBs vector, if non-empty.
std::vector<uint64_t> TotalSizes;

AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
Versions.push_back(0);
}
Expand All @@ -423,6 +427,16 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) {
for (auto &M : AE.MIBs) {
OS << "\t\t" << M << "\n";
}
if (!AE.TotalSizes.empty()) {
OS << " TotalSizes per MIB:\n\t\t";
First = true;
for (uint64_t TS : AE.TotalSizes) {
if (!First)
OS << ", ";
First = false;
OS << TS << "\n";
}
}
return OS;
}

Expand Down Expand Up @@ -1431,7 +1445,7 @@ class ModuleSummaryIndex {
// in the way some records are interpreted, like flags for instance.
// Note that incrementing this may require changes in both BitcodeReader.cpp
// and BitcodeWriter.cpp.
static constexpr uint64_t BitcodeSummaryVersion = 9;
static constexpr uint64_t BitcodeSummaryVersion = 10;

// Regular LTO module name for ASM writer
static constexpr const char *getRegularLTOModuleName() {
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;

extern cl::opt<unsigned> MaxNumVTableAnnotations;

extern cl::opt<bool> MemProfReportHintedSizes;

// Walk through the operands of a given User via worklist iteration and populate
// the set of GlobalValue references encountered. Invoked either on an
// Instruction or a GlobalVariable (which walks its initializer).
Expand Down Expand Up @@ -517,6 +519,7 @@ static void computeFunctionSummary(
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
if (MemProfMD) {
std::vector<MIBInfo> MIBs;
std::vector<uint64_t> TotalSizes;
for (auto &MDOp : MemProfMD->operands()) {
auto *MIBMD = cast<const MDNode>(MDOp);
MDNode *StackNode = getMIBStackNode(MIBMD);
Expand All @@ -536,8 +539,17 @@ static void computeFunctionSummary(
}
MIBs.push_back(
MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices)));
if (MemProfReportHintedSizes) {
auto TotalSize = getMIBTotalSize(MIBMD);
assert(TotalSize);
TotalSizes.push_back(TotalSize);
}
}
Allocs.push_back(AllocInfo(std::move(MIBs)));
if (MemProfReportHintedSizes) {
assert(Allocs.back().MIBs.size() == TotalSizes.size());
Allocs.back().TotalSizes = std::move(TotalSizes);
}
} else if (!InstCallsite.empty()) {
SmallVector<unsigned> StackIdIndices;
for (auto StackId : InstCallsite)
Expand Down
32 changes: 31 additions & 1 deletion llvm/lib/Bitcode/Reader/BitcodeReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7994,7 +7994,12 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
case bitc::FS_PERMODULE_ALLOC_INFO: {
unsigned I = 0;
std::vector<MIBInfo> MIBs;
while (I < Record.size()) {
unsigned NumMIBs = 0;
if (Version >= 10)
NumMIBs = Record[I++];
unsigned MIBsRead = 0;
while ((Version >= 10 && MIBsRead++ < NumMIBs) ||
(Version < 10 && I < Record.size())) {
assert(Record.size() - I >= 2);
AllocationType AllocType = (AllocationType)Record[I++];
unsigned NumStackEntries = Record[I++];
Expand All @@ -8007,7 +8012,19 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
}
MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
}
std::vector<uint64_t> TotalSizes;
// We either have no sizes or NumMIBs of them.
assert(I == Record.size() || Record.size() - I == NumMIBs);
if (I < Record.size()) {
MIBsRead = 0;
while (MIBsRead++ < NumMIBs)
TotalSizes.push_back(Record[I++]);
}
PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
if (!TotalSizes.empty()) {
assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
PendingAllocs.back().TotalSizes = std::move(TotalSizes);
}
break;
}

Expand All @@ -8034,8 +8051,21 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
SmallVector<uint8_t> Versions;
for (unsigned J = 0; J < NumVersions; J++)
Versions.push_back(Record[I++]);
std::vector<uint64_t> TotalSizes;
// We either have no sizes or NumMIBs of them.
assert(I == Record.size() || Record.size() - I == NumMIBs);
if (I < Record.size()) {
MIBsRead = 0;
while (MIBsRead++ < NumMIBs) {
TotalSizes.push_back(Record[I++]);
}
}
PendingAllocs.push_back(
AllocInfo(std::move(Versions), std::move(MIBs)));
if (!TotalSizes.empty()) {
assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
PendingAllocs.back().TotalSizes = std::move(TotalSizes);
}
break;
}
}
Expand Down
19 changes: 14 additions & 5 deletions llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4189,10 +4189,9 @@ static void writeFunctionHeapProfileRecords(
// Per module alloc versions should always have a single entry of
// value 0.
assert(!PerModule || (AI.Versions.size() == 1 && AI.Versions[0] == 0));
if (!PerModule) {
Record.push_back(AI.MIBs.size());
Record.push_back(AI.MIBs.size());
if (!PerModule)
Record.push_back(AI.Versions.size());
}
for (auto &MIB : AI.MIBs) {
Record.push_back((uint8_t)MIB.AllocType);
Record.push_back(MIB.StackIdIndices.size());
Expand All @@ -4203,6 +4202,11 @@ static void writeFunctionHeapProfileRecords(
for (auto V : AI.Versions)
Record.push_back(V);
}
assert(AI.TotalSizes.empty() || AI.TotalSizes.size() == AI.MIBs.size());
if (!AI.TotalSizes.empty()) {
for (auto Size : AI.TotalSizes)
Record.push_back(Size);
}
Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
: bitc::FS_COMBINED_ALLOC_INFO,
Record, AllocAbbrev);
Expand Down Expand Up @@ -4432,7 +4436,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {

Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
// nummib x (alloc type, numstackids, numstackids x stackidindex)
// optional: nummib x total size
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
Expand Down Expand Up @@ -4576,6 +4582,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
// nummib x (alloc type, numstackids, numstackids x stackidindex),
// numver x version
// optional: nummib x total size
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
Expand Down Expand Up @@ -4675,7 +4682,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
writeFunctionHeapProfileRecords(
Stream, FS, CallsiteAbbrev, AllocAbbrev,
/*PerModule*/ false,
/*GetValueId*/ [&](const ValueInfo &VI) -> unsigned {
/*GetValueId*/
[&](const ValueInfo &VI) -> unsigned {
std::optional<unsigned> ValueID = GetValueId(VI);
// This can happen in shared index files for distributed ThinLTO if
// the callee function summary is not included. Record 0 which we
Expand All @@ -4685,7 +4693,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
return 0;
return *ValueID;
},
/*GetStackIndex*/ [&](unsigned I) {
/*GetStackIndex*/
[&](unsigned I) {
// Get the corresponding index into the list of StackIds actually
// being written for this combined index (which may be a subset in
// the case of distributed indexes).
Expand Down
83 changes: 68 additions & 15 deletions llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ cl::opt<bool> SupportsHotColdNew(
cl::desc("Linking with hot/cold operator new interfaces"));
} // namespace llvm

extern cl::opt<bool> MemProfReportHintedSizes;

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
Expand Down Expand Up @@ -172,6 +174,7 @@ class CallsiteContextGraph {

void dump() const;
void print(raw_ostream &OS) const;
void printTotalSizes(raw_ostream &OS) const;

friend raw_ostream &operator<<(raw_ostream &OS,
const CallsiteContextGraph &CCG) {
Expand Down Expand Up @@ -439,7 +442,7 @@ class CallsiteContextGraph {
void addStackNodesForMIB(ContextNode *AllocNode,
CallStack<NodeT, IteratorT> &StackContext,
CallStack<NodeT, IteratorT> &CallsiteContext,
AllocationType AllocType);
AllocationType AllocType, uint64_t TotalSize);

/// Matches all callsite metadata (or summary) to the nodes created for
/// allocation memprof MIB metadata, synthesizing new nodes to reflect any
Expand Down Expand Up @@ -611,6 +614,10 @@ class CallsiteContextGraph {
/// Map from each context ID to the AllocationType assigned to that context.
DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

/// Map from each contextID to the profiled aggregate allocation size,
/// optionally populated when requested (via MemProfReportHintedSizes).
DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;

/// Identifies the context node created for a stack id when adding the MIB
/// contexts to the graph. This is used to locate the context nodes when
/// trying to assign the corresponding callsites with those stack ids to these
Expand Down Expand Up @@ -1004,18 +1011,36 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
return AllocNode;
}

/// Builds a printable name for an allocation type bitmask.
///
/// Returns "None" when \p AllocTypes is zero. Otherwise the result is built
/// from the NotCold and Cold bits only, in that order, so a mask with both
/// bits set yields "NotColdCold". Any other bits contribute nothing to the
/// returned string.
static std::string getAllocTypeString(uint8_t AllocTypes) {
  if (AllocTypes == 0)
    return "None";
  const bool HasNotCold = (AllocTypes & (uint8_t)AllocationType::NotCold) != 0;
  const bool HasCold = (AllocTypes & (uint8_t)AllocationType::Cold) != 0;
  std::string Name;
  if (HasNotCold)
    Name.append("NotCold");
  if (HasCold)
    Name.append("Cold");
  return Name;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
uint64_t TotalSize) {
assert(!MemProfReportHintedSizes || TotalSize > 0);
// Treating the hot alloc type as NotCold before the disambiguation for "hot"
// is done.
if (AllocType == AllocationType::Hot)
AllocType = AllocationType::NotCold;

ContextIdToAllocationType[++LastContextId] = AllocType;

if (MemProfReportHintedSizes) {
assert(TotalSize);
ContextIdToTotalSize[LastContextId] = TotalSize;
}

// Update alloc type and context ids for this MIB.
AllocNode->AllocTypes |= (uint8_t)AllocType;

Expand Down Expand Up @@ -1060,6 +1085,10 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
assert(ContextIdToAllocationType.count(OldId));
// The new context has the same allocation type as original.
ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
// For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
// up the size. Assume that if we are able to duplicate context ids that we
// will be able to disambiguate all copies.
ContextIdToTotalSize[LastContextId] = 0;
}
return NewContextIds;
}
Expand Down Expand Up @@ -1663,7 +1692,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
AllocNode, StackContext, CallsiteContext,
getMIBAllocType(MIBMD));
getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
}
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
// Memprof and callsite metadata on memory allocations no longer
Expand Down Expand Up @@ -1735,12 +1764,20 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
// stack ids on the allocation call during ModuleSummaryAnalysis.
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
EmptyContext;
unsigned I = 0;
assert(!MemProfReportHintedSizes ||
AN.TotalSizes.size() == AN.MIBs.size());
// Now add all of the MIBs and their stack nodes.
for (auto &MIB : AN.MIBs) {
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
StackContext(&MIB);
uint64_t TotalSize = 0;
if (MemProfReportHintedSizes)
TotalSize = AN.TotalSizes[I];
addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
AllocNode, StackContext, EmptyContext, MIB.AllocType);
AllocNode, StackContext, EmptyContext, MIB.AllocType,
TotalSize);
I++;
}
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
// Initialize version 0 on the summary alloc node to the current alloc
Expand Down Expand Up @@ -2171,17 +2208,6 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc(
return true;
}

/// Builds a printable name for an allocation type bitmask.
///
/// Returns "None" when \p AllocTypes is zero. Otherwise the result is built
/// from the NotCold and Cold bits only, in that order, so a mask with both
/// bits set yields "NotColdCold". Any other bits contribute nothing to the
/// returned string.
static std::string getAllocTypeString(uint8_t AllocTypes) {
  if (AllocTypes == 0)
    return "None";
  const bool HasNotCold = (AllocTypes & (uint8_t)AllocationType::NotCold) != 0;
  const bool HasCold = (AllocTypes & (uint8_t)AllocationType::Cold) != 0;
  std::string Name;
  if (HasNotCold)
    Name.append("NotCold");
  if (HasCold)
    Name.append("Cold");
  return Name;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
const {
Expand Down Expand Up @@ -2261,6 +2287,30 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
}
}

/// Writes one report line to \p OS for every context id attached to each
/// allocation node that has not been removed.
///
/// Each line gives the allocation type recorded for the context id, the id
/// itself, the total size recorded for it in ContextIdToTotalSize, and the
/// allocation type string of the node that currently holds the context.
/// Ids are emitted in ascending order within each node.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
    raw_ostream &OS) const {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  for (const auto AllocNode : nodes<GraphType>(this)) {
    // Only allocation nodes that are still live are reported.
    if (AllocNode->isRemoved() || !AllocNode->IsAllocation)
      continue;

    // Sort a copy of the ids so the output order does not depend on how the
    // set happens to iterate.
    DenseSet<uint32_t> NodeIds = AllocNode->getContextIds();
    std::vector<uint32_t> OrderedIds(NodeIds.begin(), NodeIds.end());
    std::sort(OrderedIds.begin(), OrderedIds.end());

    // The node's own type string is the same for every id it holds.
    const std::string NodeTypeStr = getAllocTypeString(AllocNode->AllocTypes);
    for (uint32_t CtxId : OrderedIds) {
      auto SizeIt = ContextIdToTotalSize.find(CtxId);
      assert(SizeIt != ContextIdToTotalSize.end());
      auto TypeIt = ContextIdToAllocationType.find(CtxId);
      assert(TypeIt != ContextIdToAllocationType.end());
      const std::string ContextTypeStr =
          getAllocTypeString((uint8_t)TypeIt->second);
      OS << ContextTypeStr << " context " << CtxId << " with total size "
         << SizeIt->second << " is " << NodeTypeStr << " after cloning\n";
    }
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
Expand Down Expand Up @@ -3797,6 +3847,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
if (ExportToDot)
exportToDot("clonefuncassign");

if (MemProfReportHintedSizes)
printTotalSizes(errs());

return Changed;
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Bitcode/summary_version.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s

; CHECK: <GLOBALVAL_SUMMARY_BLOCK
; CHECK: <VERSION op0=9/>
; CHECK: <VERSION op0=10/>



Expand Down
Loading

0 comments on commit 9f8205d

Please sign in to comment.