diff --git a/cmake/Modules/FindLLVM.cmake b/cmake/Modules/FindLLVM.cmake index 9dfc883a31c..3e15fa9a81a 100644 --- a/cmake/Modules/FindLLVM.cmake +++ b/cmake/Modules/FindLLVM.cmake @@ -32,7 +32,8 @@ # We also want an user-specified LLVM_ROOT_DIR to take precedence over the # system default locations such as /usr/local/bin. Executing find_program() # multiples times is the approach recommended in the docs. -set(llvm_config_names llvm-config-18.1 llvm-config181 llvm-config-18 +set(llvm_config_names llvm-config-19.1 llvm-config191 llvm-config-19 + llvm-config-18.1 llvm-config181 llvm-config-18 llvm-config-17.0 llvm-config170 llvm-config-17 llvm-config-16.0 llvm-config160 llvm-config-16 llvm-config-15.0 llvm-config150 llvm-config-15 @@ -46,9 +47,11 @@ if(APPLE) # extra fallbacks for MacPorts & Homebrew find_program(LLVM_CONFIG NAMES ${llvm_config_names} - PATHS /opt/local/libexec/llvm-18/bin /opt/local/libexec/llvm-17/bin + PATHS /opt/local/libexec/llvm-19/bin + /opt/local/libexec/llvm-18/bin /opt/local/libexec/llvm-17/bin /opt/local/libexec/llvm-16/bin /opt/local/libexec/llvm-15/bin /opt/local/libexec/llvm/bin + /usr/local/opt/llvm@19/bin /usr/local/opt/llvm@18/bin /usr/local/opt/llvm@17/bin /usr/local/opt/llvm@16/bin /usr/local/opt/llvm@15/bin /usr/local/opt/llvm/bin @@ -138,11 +141,6 @@ else() string(REPLACE "-llibxml2.tbd" "-lxml2" LLVM_LDFLAGS ${LLVM_LDFLAGS}) endif() - if(${LLVM_VERSION_MAJOR} LESS "15") - # Versions below 15.0 do not support component windowsdriver - list(REMOVE_ITEM LLVM_FIND_COMPONENTS "windowsdriver") - endif() - llvm_set(LIBRARY_DIRS libdir true) llvm_set_libs(LIBRARIES libs "${LLVM_FIND_COMPONENTS}") # LLVM bug: llvm-config --libs tablegen returns -lLLVM-3.8.0 diff --git a/tests/lit.site.cfg.in b/tests/lit.site.cfg.in index 537dbf78fec..ed6ccec5e07 100644 --- a/tests/lit.site.cfg.in +++ b/tests/lit.site.cfg.in @@ -86,7 +86,7 @@ config.available_features.add("llvm%d" % config.llvm_version) # config.llvm_version: 309, 400, 500, ... # plusoneable_llvmversion: 39, 40, 50, ... plusoneable_llvmversion = config.llvm_version // 10 + config.llvm_version%10 -for version in range(140, plusoneable_llvmversion+1): +for version in range(150, plusoneable_llvmversion+1): config.available_features.add("atleast_llvm%d0%d" % (version//10, version%10)) for version in range(plusoneable_llvmversion, 201): config.available_features.add("atmost_llvm%d0%d" % (version//10, version%10)) diff --git a/tools/ldc-profdata/llvm-profdata-19.1.cpp b/tools/ldc-profdata/llvm-profdata-19.1.cpp new file mode 100644 index 00000000000..d2d2f945822 --- /dev/null +++ b/tools/ldc-profdata/llvm-profdata-19.1.cpp @@ -0,0 +1,3383 @@ +//===- llvm-profdata.cpp - LLVM profile data tool -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// llvm-profdata merges .profdata files. 
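+// It also implements the show, order and overlap subcommands for inspecting,
+// reordering and comparing profiles; see the cl::SubCommand definitions
+// below.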
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/ProfileData/InstrProfCorrelator.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/InstrProfWriter.h"
+#include "llvm/ProfileData/MemProf.h"
+#include "llvm/ProfileData/MemProfReader.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Discriminator.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LLVMDriver.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cmath>
+#include <optional>
+#include <queue>
+
+using namespace llvm;
+using ProfCorrelatorKind = InstrProfCorrelator::ProfCorrelatorKind;
+
+// https://llvm.org/docs/CommandGuide/llvm-profdata.html has documentation
+// on each subcommand.
+cl::SubCommand ShowSubcommand(
+    "show",
+    "Takes a profile data file and displays the profiles. See detailed "
+    "documentation in "
+    "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-show");
+cl::SubCommand OrderSubcommand(
+    "order",
+    "Reads temporal profiling traces from a profile and outputs a function "
+    "order that reduces the number of page faults for those traces. See "
+    "detailed documentation in "
+    "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-order");
+cl::SubCommand OverlapSubcommand(
+    "overlap",
+    "Computes and displays the overlap between two profiles. See detailed "
+    "documentation in "
+    "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-overlap");
+cl::SubCommand MergeSubcommand(
+    "merge",
+    "Takes several profiles and merges them together. See detailed "
+    "documentation in "
+    "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-merge");
+
+namespace {
+enum ProfileKinds { instr, sample, memory };
+enum FailureMode { warnOnly, failIfAnyAreInvalid, failIfAllAreInvalid };
+
+enum ProfileFormat {
+  PF_None = 0,
+  PF_Text,
+  PF_Compact_Binary, // Deprecated
+  PF_Ext_Binary,
+  PF_GCC,
+  PF_Binary
+};
+
+enum class ShowFormat { Text, Json, Yaml };
+} // namespace
+
+// Common options.
+cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                    cl::init("-"), cl::desc("Output file"),
+                                    cl::sub(ShowSubcommand),
+                                    cl::sub(OrderSubcommand),
+                                    cl::sub(OverlapSubcommand),
+                                    cl::sub(MergeSubcommand));
+// NOTE: cl::alias must not have cl::sub(), since aliased option's cl::sub()
+// will be used. llvm::cl::alias::done() method asserts this condition.
+cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                          cl::aliasopt(OutputFilename));
+
+// Options common to at least two commands.
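+// Unlike --output above, which is registered with all four subcommands, each
+// option below lists the subset of subcommands it applies to via cl::sub().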
+cl::opt<ProfileKinds> ProfileKind(
+    cl::desc("Profile kind:"), cl::sub(MergeSubcommand),
+    cl::sub(OverlapSubcommand), cl::init(instr),
+    cl::values(clEnumVal(instr, "Instrumentation profile (default)"),
+               clEnumVal(sample, "Sample profile")));
+cl::opt<std::string> Filename(cl::Positional, cl::desc("<profdata-file>"),
+                              cl::sub(ShowSubcommand),
+                              cl::sub(OrderSubcommand));
+cl::opt<unsigned> MaxDbgCorrelationWarnings(
+    "max-debug-info-correlation-warnings",
+    cl::desc("The maximum number of warnings to emit when correlating "
+             "profile from debug info (0 = no limit)"),
+    cl::sub(MergeSubcommand), cl::sub(ShowSubcommand), cl::init(5));
+cl::opt<std::string> ProfiledBinary(
+    "profiled-binary", cl::init(""),
+    cl::desc("Path to binary from which the profile was collected."),
+    cl::sub(ShowSubcommand), cl::sub(MergeSubcommand));
+cl::opt<std::string> DebugInfoFilename(
+    "debug-info", cl::init(""),
+    cl::desc(
+        "For show, read and extract profile metadata from debug info and show "
+        "the functions it found. For merge, use the provided debug info to "
+        "correlate the raw profile."),
+    cl::sub(ShowSubcommand), cl::sub(MergeSubcommand));
+cl::opt<std::string>
+    BinaryFilename("binary-file", cl::init(""),
+                   cl::desc("For merge, use the provided unstripped binary to "
+                            "correlate the raw profile."),
+                   cl::sub(MergeSubcommand));
+cl::opt<std::string> FuncNameFilter(
+    "function",
+    cl::desc("Only functions matching the filter are shown in the output. For "
+             "overlapping CSSPGO, this takes a function name with calling "
+             "context."),
+    cl::sub(ShowSubcommand), cl::sub(OverlapSubcommand),
+    cl::sub(MergeSubcommand));
+
+// TODO: Consider creating a template class (e.g., MergeOption, ShowOption) to
+// factor out the common cl::sub in cl::opt constructor for subcommand-specific
+// options.
+
+// Options specific to merge subcommand.
+cl::list<std::string> InputFilenames(cl::Positional, cl::sub(MergeSubcommand),
+                                     cl::desc("<filename...>"));
+cl::list<std::string> WeightedInputFilenames("weighted-input",
+                                             cl::sub(MergeSubcommand),
+                                             cl::desc("<weight>,<filename>"));
+cl::opt<ProfileFormat> OutputFormat(
+    cl::desc("Format of output profile"), cl::sub(MergeSubcommand),
+    cl::init(PF_Ext_Binary),
+    cl::values(clEnumValN(PF_Binary, "binary", "Binary encoding"),
+               clEnumValN(PF_Ext_Binary, "extbinary",
+                          "Extensible binary encoding "
+                          "(default)"),
+               clEnumValN(PF_Text, "text", "Text encoding"),
+               clEnumValN(PF_GCC, "gcc",
+                          "GCC encoding (only meaningful for -sample)")));
+cl::opt<std::string>
+    InputFilenamesFile("input-files", cl::init(""), cl::sub(MergeSubcommand),
+                       cl::desc("Path to file containing newline-separated "
+                                "[<weight>,]<filename> entries"));
+cl::alias InputFilenamesFileA("f", cl::desc("Alias for --input-files"),
+                              cl::aliasopt(InputFilenamesFile));
+cl::opt<bool> DumpInputFileList(
+    "dump-input-file-list", cl::init(false), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("Dump the list of input files and their weights, then exit"));
+cl::opt<std::string> RemappingFile("remapping-file", cl::value_desc("file"),
+                                   cl::sub(MergeSubcommand),
+                                   cl::desc("Symbol remapping file"));
+cl::alias RemappingFileA("r", cl::desc("Alias for --remapping-file"),
+                         cl::aliasopt(RemappingFile));
+cl::opt<bool>
+    UseMD5("use-md5", cl::init(false), cl::Hidden,
+           cl::desc("Choose to use MD5 to represent string in name table "
+                    "(only meaningful for -extbinary)"),
+           cl::sub(MergeSubcommand));
+cl::opt<bool> CompressAllSections(
+    "compress-all-sections", cl::init(false), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("Compress all sections when writing the profile (only "
+             "meaningful for -extbinary)"));
+cl::opt<bool> SampleMergeColdContext(
+    "sample-merge-cold-context", cl::init(false), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc(
+        "Merge context sample profiles whose count is below cold threshold"));
+cl::opt<bool> SampleTrimColdContext(
+    "sample-trim-cold-context", cl::init(false), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc(
+        "Trim context sample profiles whose count is below cold threshold"));
+cl::opt<uint32_t> SampleColdContextFrameDepth(
+    "sample-frame-depth-for-cold-context", cl::init(1),
+    cl::sub(MergeSubcommand),
+    cl::desc("Keep the last K frames while merging cold profile. 1 means the "
+             "context-less base profile"));
+cl::opt<uint64_t> OutputSizeLimit(
+    "output-size-limit", cl::init(0), cl::Hidden, cl::sub(MergeSubcommand),
+    cl::desc("Trim cold functions until profile size is below specified "
+             "limit in bytes. This uses a heuristic and functions may be "
+             "excessively trimmed"));
+cl::opt<bool> GenPartialProfile(
+    "gen-partial-profile", cl::init(false), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("Generate a partial profile (only meaningful for -extbinary)"));
+cl::opt<std::string> SupplInstrWithSample(
+    "supplement-instr-with-sample", cl::init(""), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("Supplement an instr profile with sample profile, to correct "
+             "the profile unrepresentativeness issue. The sample "
+             "profile is the input of the flag. Output will be in instr "
+             "format (The flag only works with -instr)"));
+cl::opt<float> ZeroCounterThreshold(
+    "zero-counter-threshold", cl::init(0.7), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("For the function which is cold in instr profile but hot in "
+             "sample profile, if the ratio of the number of zero counters "
+             "divided by the total number of counters is above the "
+             "threshold, the profile of the function will be regarded as "
+             "being harmful for performance and will be dropped."));
+cl::opt<unsigned> SupplMinSizeThreshold(
+    "suppl-min-size-threshold", cl::init(10), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("If the size of a function is smaller than the threshold, "
+             "assume it can be inlined by PGO early inliner and it won't "
+             "be adjusted based on sample profile."));
+cl::opt<unsigned> InstrProfColdThreshold(
+    "instr-prof-cold-threshold", cl::init(0), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("User specified cold threshold for instr profile which will "
+             "override the cold threshold got from profile summary. "));
+// WARNING: This reservoir size value is propagated to any input indexed
+// profiles for simplicity. Changing this value between invocations could
+// result in sample bias.
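+// (Reservoir sampling keeps a bounded, uniformly drawn subset of the traces
+// seen so far; merging with a different reservoir size changes the inclusion
+// probability of traces carried over from the indexed inputs, which is the
+// bias referred to above.)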
+cl::opt<uint64_t> TemporalProfTraceReservoirSize(
+    "temporal-profile-trace-reservoir-size", cl::init(100),
+    cl::sub(MergeSubcommand),
+    cl::desc("The maximum number of stored temporal profile traces (default: "
+             "100)"));
+cl::opt<uint64_t> TemporalProfMaxTraceLength(
+    "temporal-profile-max-trace-length", cl::init(10000),
+    cl::sub(MergeSubcommand),
+    cl::desc("The maximum length of a single temporal profile trace "
+             "(default: 10000)"));
+cl::opt<std::string> FuncNameNegativeFilter(
+    "no-function", cl::init(""),
+    cl::sub(MergeSubcommand),
+    cl::desc("Exclude functions matching the filter from the output."));
+
+cl::opt<FailureMode>
+    FailMode("failure-mode", cl::init(failIfAnyAreInvalid),
+             cl::desc("Failure mode:"), cl::sub(MergeSubcommand),
+             cl::values(clEnumValN(warnOnly, "warn",
+                                   "Do not fail and just print warnings."),
+                        clEnumValN(failIfAnyAreInvalid, "any",
+                                   "Fail if any profile is invalid."),
+                        clEnumValN(failIfAllAreInvalid, "all",
+                                   "Fail only if all profiles are invalid.")));
+
+cl::opt<bool> OutputSparse(
+    "sparse", cl::init(false), cl::sub(MergeSubcommand),
+    cl::desc("Generate a sparse profile (only meaningful for -instr)"));
+cl::opt<unsigned> NumThreads(
+    "num-threads", cl::init(0), cl::sub(MergeSubcommand),
+    cl::desc("Number of merge threads to use (default: autodetect)"));
+cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"),
+                      cl::aliasopt(NumThreads));
+
+cl::opt<std::string> ProfileSymbolListFile(
+    "prof-sym-list", cl::init(""), cl::sub(MergeSubcommand),
+    cl::desc("Path to file containing the list of function symbols "
+             "used to populate profile symbol list"));
+
+cl::opt<SampleProfileLayout> ProfileLayout(
+    "convert-sample-profile-layout",
+    cl::desc("Convert the generated profile to a profile with a new layout"),
+    cl::sub(MergeSubcommand), cl::init(SPL_None),
+    cl::values(
+        clEnumValN(SPL_Nest, "nest",
+                   "Nested profile, the input should be CS flat profile"),
+        clEnumValN(SPL_Flat, "flat",
+                   "Profile with nested inlinee flatten out")));
+
+cl::opt<bool> DropProfileSymbolList(
+    "drop-profile-symbol-list", cl::init(false), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("Drop the profile symbol list when merging AutoFDO profiles "
+             "(only meaningful for -sample)"));
+
+cl::opt<bool> KeepVTableSymbols(
+    "keep-vtable-symbols", cl::init(false), cl::Hidden,
+    cl::sub(MergeSubcommand),
+    cl::desc("If true, keep the vtable symbols in indexed profiles"));
+
+// Temporary support for writing the previous version of the format, to enable
+// some forward compatibility.
+// TODO: Consider enabling this with future version changes as well, to ease
+// deployment of newer versions of llvm-profdata.
+cl::opt<bool> DoWritePrevVersion(
+    "write-prev-version", cl::init(false), cl::Hidden,
+    cl::desc("Write the previous version of indexed format, to enable "
+             "some forward compatibility."));
+
+cl::opt<memprof::IndexedVersion> MemProfVersionRequested(
+    "memprof-version", cl::Hidden, cl::sub(MergeSubcommand),
+    cl::desc("Specify the version of the memprof format to use"),
+    cl::init(memprof::Version0),
+    cl::values(clEnumValN(memprof::Version0, "0", "version 0"),
+               clEnumValN(memprof::Version1, "1", "version 1"),
+               clEnumValN(memprof::Version2, "2", "version 2"),
+               clEnumValN(memprof::Version3, "3", "version 3")));
+
+cl::opt<bool> MemProfFullSchema(
+    "memprof-full-schema", cl::Hidden, cl::sub(MergeSubcommand),
+    cl::desc("Use the full schema for serialization"), cl::init(false));
+
+// Options specific to overlap subcommand.
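+// A typical invocation looks like:
+//   llvm-profdata overlap [-cs] <base profile file> <test profile file>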
+cl::opt<std::string> BaseFilename(cl::Positional, cl::Required,
+                                  cl::desc("<base profile file>"),
+                                  cl::sub(OverlapSubcommand));
+cl::opt<std::string> TestFilename(cl::Positional, cl::Required,
+                                  cl::desc("<test profile file>"),
+                                  cl::sub(OverlapSubcommand));
+
+cl::opt<unsigned long long> SimilarityCutoff(
+    "similarity-cutoff", cl::init(0),
+    cl::desc("For sample profiles, list function names (with calling context "
+             "for csspgo) for overlapped functions "
+             "with similarities below the cutoff (percentage times 10000)."),
+    cl::sub(OverlapSubcommand));
+
+cl::opt<bool> IsCS(
+    "cs", cl::init(false),
+    cl::desc("For context sensitive PGO counts. Does not work with CSSPGO."),
+    cl::sub(OverlapSubcommand));
+
+cl::opt<unsigned long long> OverlapValueCutoff(
+    "value-cutoff", cl::init(-1),
+    cl::desc(
+        "Function level overlap information for every function (with calling "
+        "context for csspgo) in test "
+        "profile with max count value greater than the parameter value"),
+    cl::sub(OverlapSubcommand));
+
+// Options specific to show subcommand.
+cl::opt<bool> ShowCounts("counts", cl::init(false),
+                         cl::desc("Show counter values for shown functions"),
+                         cl::sub(ShowSubcommand));
+cl::opt<ShowFormat>
+    SFormat("show-format", cl::init(ShowFormat::Text),
+            cl::desc("Emit output in the selected format if supported"),
+            cl::sub(ShowSubcommand),
+            cl::values(clEnumValN(ShowFormat::Text, "text",
+                                  "emit normal text output (default)"),
+                       clEnumValN(ShowFormat::Json, "json", "emit JSON"),
+                       clEnumValN(ShowFormat::Yaml, "yaml", "emit YAML")));
+// TODO: Consider replacing this with `--show-format=text-encoding`.
+cl::opt<bool>
+    TextFormat("text", cl::init(false),
+               cl::desc("Show instr profile data in text dump format"),
+               cl::sub(ShowSubcommand));
+cl::opt<bool>
+    JsonFormat("json",
+               cl::desc("Show sample profile data in the JSON format "
+                        "(deprecated, please use --show-format=json)"),
+               cl::sub(ShowSubcommand));
+cl::opt<bool> ShowIndirectCallTargets(
+    "ic-targets", cl::init(false),
+    cl::desc("Show indirect call site target values for shown functions"),
+    cl::sub(ShowSubcommand));
+cl::opt<bool> ShowVTables("show-vtables", cl::init(false),
+                          cl::desc("Show vtable names for shown functions"),
+                          cl::sub(ShowSubcommand));
+cl::opt<bool> ShowMemOPSizes(
+    "memop-sizes", cl::init(false),
+    cl::desc("Show the profiled sizes of the memory intrinsic calls "
+             "for shown functions"),
+    cl::sub(ShowSubcommand));
+cl::opt<bool> ShowDetailedSummary("detailed-summary", cl::init(false),
+                                  cl::desc("Show detailed profile summary"),
+                                  cl::sub(ShowSubcommand));
+cl::list<uint32_t> DetailedSummaryCutoffs(
+    cl::CommaSeparated, "detailed-summary-cutoffs",
+    cl::desc(
+        "Cutoff percentages (times 10000) for generating detailed summary"),
+    cl::value_desc("800000,901000,999999"), cl::sub(ShowSubcommand));
+cl::opt<bool> ShowHotFuncList(
+    "hot-func-list", cl::init(false),
+    cl::desc("Show profile summary of a list of hot functions"),
+    cl::sub(ShowSubcommand));
+cl::opt<bool> ShowAllFunctions("all-functions", cl::init(false),
+                               cl::desc("Details for each and every function"),
+                               cl::sub(ShowSubcommand));
+cl::opt<bool> ShowCS("showcs", cl::init(false),
+                     cl::desc("Show context sensitive counts"),
+                     cl::sub(ShowSubcommand));
+cl::opt<ProfileKinds> ShowProfileKind(
+    cl::desc("Profile kind supported by show:"), cl::sub(ShowSubcommand),
+    cl::init(instr),
+    cl::values(clEnumVal(instr, "Instrumentation profile (default)"),
+               clEnumVal(sample, "Sample profile"),
+               clEnumVal(memory, "MemProf memory access profile")));
+cl::opt<uint32_t> TopNFunctions(
+    "topn", cl::init(0),
+    cl::desc("Show the list of functions with the largest internal counts"),
+    cl::sub(ShowSubcommand));
+cl::opt<uint64_t> ShowValueCutoff(
+    "value-cutoff", cl::init(0),
+    cl::desc("Set the count value cutoff. Functions with the maximum count "
+             "less than this value will not be printed out. (Default is 0)"),
+    cl::sub(ShowSubcommand));
+cl::opt<bool> OnlyListBelow(
+    "list-below-cutoff", cl::init(false),
+    cl::desc("Only output names of functions whose max count values are "
+             "below the cutoff value"),
+    cl::sub(ShowSubcommand));
+cl::opt<bool> ShowProfileSymbolList(
+    "show-prof-sym-list", cl::init(false),
+    cl::desc("Show profile symbol list if it exists in the profile. "),
+    cl::sub(ShowSubcommand));
+cl::opt<bool> ShowSectionInfoOnly(
+    "show-sec-info-only", cl::init(false),
+    cl::desc("Show the information of each section in the sample profile. "
+             "The flag is only usable when the sample profile is in "
+             "extbinary format"),
+    cl::sub(ShowSubcommand));
+cl::opt<bool> ShowBinaryIds("binary-ids", cl::init(false),
+                            cl::desc("Show binary ids in the profile. "),
+                            cl::sub(ShowSubcommand));
+cl::opt<bool> ShowTemporalProfTraces(
+    "temporal-profile-traces",
+    cl::desc("Show temporal profile traces in the profile."),
+    cl::sub(ShowSubcommand));
+
+cl::opt<bool>
+    ShowCovered("covered", cl::init(false),
+                cl::desc("Show only the functions that have been executed."),
+                cl::sub(ShowSubcommand));
+
+cl::opt<bool> ShowProfileVersion("profile-version", cl::init(false),
+                                 cl::desc("Show profile version. "),
+                                 cl::sub(ShowSubcommand));
+
+// Options specific to order subcommand.
+cl::opt<unsigned>
+    NumTestTraces("num-test-traces", cl::init(0),
+                  cl::desc("Keep aside the last <num-test-traces> traces in "
+                           "the profile when computing the function order and "
+                           "instead use them to evaluate that order"),
+                  cl::sub(OrderSubcommand));
+
+// We use this string to indicate that there are
+// multiple static functions that map to the same name.
+const std::string DuplicateNameStr = "----";
+
+static void warn(Twine Message, StringRef Whence = "", StringRef Hint = "") {
+  WithColor::warning();
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+}
+
+static void warn(Error E, StringRef Whence = "") {
+  if (E.isA<InstrProfError>()) {
+    handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+      warn(IPE.message(), Whence);
+    });
+  }
+}
+
+static void exitWithError(Twine Message, StringRef Whence = "",
+                          StringRef Hint = "") {
+  WithColor::error();
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+  ::exit(1);
+}
+
+static void exitWithError(Error E, StringRef Whence = "") {
+  if (E.isA<InstrProfError>()) {
+    handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+      instrprof_error instrError = IPE.get();
+      StringRef Hint = "";
+      if (instrError == instrprof_error::unrecognized_format) {
+        // Hint in case user missed specifying the profile type.
+        Hint = "Perhaps you forgot to use the --sample or --memory option?";
+      }
+      exitWithError(IPE.message(), Whence, Hint);
+    });
+    return;
+  }
+
+  exitWithError(toString(std::move(E)), Whence);
+}
+
+static void exitWithErrorCode(std::error_code EC, StringRef Whence = "") {
+  exitWithError(EC.message(), Whence);
+}
+
+static void warnOrExitGivenError(FailureMode FailMode, std::error_code EC,
+                                 StringRef Whence = "") {
+  if (FailMode == failIfAnyAreInvalid)
+    exitWithErrorCode(EC, Whence);
+  else
+    warn(EC.message(), Whence);
+}
+
+static void handleMergeWriterError(Error E, StringRef WhenceFile = "",
+                                   StringRef WhenceFunction = "",
+                                   bool ShowHint = true) {
+  if (!WhenceFile.empty())
+    errs() << WhenceFile << ": ";
+  if (!WhenceFunction.empty())
+    errs() << WhenceFunction << ": ";
+
+  auto IPE = instrprof_error::success;
+  E = handleErrors(std::move(E),
+                   [&IPE](std::unique_ptr<InstrProfError> E) -> Error {
+                     IPE = E->get();
+                     return Error(std::move(E));
+                   });
+  errs() << toString(std::move(E)) << "\n";
+
+  if (ShowHint) {
+    StringRef Hint = "";
+    if (IPE != instrprof_error::success) {
+      switch (IPE) {
+      case instrprof_error::hash_mismatch:
+      case instrprof_error::count_mismatch:
+      case instrprof_error::value_site_count_mismatch:
+        Hint = "Make sure that all profile data to be merged is generated "
+               "from the same binary.";
+        break;
+      default:
+        break;
+      }
+    }
+
+    if (!Hint.empty())
+      errs() << Hint << "\n";
+  }
+}
+
+namespace {
+/// A remapper from original symbol names to new symbol names based on a file
+/// containing a list of mappings from old name to new name.
+class SymbolRemapper {
+  std::unique_ptr<MemoryBuffer> File;
+  DenseMap<StringRef, StringRef> RemappingTable;
+
+public:
+  /// Build a SymbolRemapper from a file containing a list of old/new symbols.
+  static std::unique_ptr<SymbolRemapper> create(StringRef InputFile) {
+    auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFile);
+    if (!BufOrError)
+      exitWithErrorCode(BufOrError.getError(), InputFile);
+
+    auto Remapper = std::make_unique<SymbolRemapper>();
+    Remapper->File = std::move(BufOrError.get());
+
+    for (line_iterator LineIt(*Remapper->File, /*SkipBlanks=*/true, '#');
+         !LineIt.is_at_eof(); ++LineIt) {
+      std::pair<StringRef, StringRef> Parts = LineIt->split(' ');
+      if (Parts.first.empty() || Parts.second.empty() ||
+          Parts.second.count(' ')) {
+        exitWithError("unexpected line in remapping file",
+                      (InputFile + ":" + Twine(LineIt.line_number())).str(),
+                      "expected 'old_symbol new_symbol'");
+      }
+      Remapper->RemappingTable.insert(Parts);
+    }
+    return Remapper;
+  }
+
+  /// Attempt to map the given old symbol into a new symbol.
+  ///
+  /// \return The new symbol, or \p Name if no such symbol was found.
+  StringRef operator()(StringRef Name) {
+    StringRef New = RemappingTable.lookup(Name);
+    return New.empty() ? Name : New;
+  }
+
+  FunctionId operator()(FunctionId Name) {
+    // MD5 name cannot be remapped.
+    if (!Name.isStringRef())
+      return Name;
+    StringRef New = RemappingTable.lookup(Name.stringRef());
+    return New.empty() ? Name : FunctionId(New);
+  }
+};
+} // namespace
+
+struct WeightedFile {
+  std::string Filename;
+  uint64_t Weight;
+};
+typedef SmallVector<WeightedFile, 5> WeightedFileVector;
+
+/// Keep track of merged data and reported errors.
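+/// One WriterContext is created per merge thread; loadInput() fills a context
+/// and mergeWriterContexts() below later combines the per-thread contexts
+/// pairwise.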
+struct WriterContext {
+  std::mutex Lock;
+  InstrProfWriter Writer;
+  std::vector<std::pair<Error, std::string>> Errors;
+  std::mutex &ErrLock;
+  SmallSet<instrprof_error, 4> &WriterErrorCodes;
+
+  WriterContext(bool IsSparse, std::mutex &ErrLock,
+                SmallSet<instrprof_error, 4> &WriterErrorCodes,
+                uint64_t ReservoirSize = 0, uint64_t MaxTraceLength = 0)
+      : Writer(IsSparse, ReservoirSize, MaxTraceLength, DoWritePrevVersion,
+               MemProfVersionRequested, MemProfFullSchema),
+        ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {}
+};
+
+/// Compute the overlap between profile BaseFilename and profile TestFilename,
+/// and store the program-level result in Overlap.
+static void overlapInput(const std::string &BaseFilename,
+                         const std::string &TestFilename, WriterContext *WC,
+                         OverlapStats &Overlap,
+                         const OverlapFuncFilters &FuncFilter,
+                         raw_fd_ostream &OS, bool IsCS) {
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = InstrProfReader::create(TestFilename, *FS);
+  if (Error E = ReaderOrErr.takeError()) {
+    // Skip the empty profiles by returning silently.
+    auto [ErrorCode, Msg] = InstrProfError::take(std::move(E));
+    if (ErrorCode != instrprof_error::empty_raw_profile)
+      WC->Errors.emplace_back(make_error<InstrProfError>(ErrorCode, Msg),
+                              TestFilename);
+    return;
+  }
+
+  auto Reader = std::move(ReaderOrErr.get());
+  for (auto &I : *Reader) {
+    OverlapStats FuncOverlap(OverlapStats::FunctionLevel);
+    FuncOverlap.setFuncInfo(I.Name, I.Hash);
+
+    WC->Writer.overlapRecord(std::move(I), Overlap, FuncOverlap, FuncFilter);
+    FuncOverlap.dump(OS);
+  }
+}
+
+/// Load an input into a writer context.
+static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper,
+                      const InstrProfCorrelator *Correlator,
+                      const StringRef ProfiledBinary, WriterContext *WC) {
+  std::unique_lock<std::mutex> CtxGuard{WC->Lock};
+
+  // Copy the filename, because llvm::ThreadPool copied the input "const
+  // WeightedFile &" by value, making a reference to the filename within it
+  // invalid outside of this packaged task.
+  std::string Filename = Input.Filename;
+
+  using ::llvm::memprof::RawMemProfReader;
+  if (RawMemProfReader::hasFormat(Input.Filename)) {
+    auto ReaderOrErr = RawMemProfReader::create(Input.Filename, ProfiledBinary);
+    if (!ReaderOrErr) {
+      exitWithError(ReaderOrErr.takeError(), Input.Filename);
+    }
+    std::unique_ptr<RawMemProfReader> Reader = std::move(ReaderOrErr.get());
+    // Check if the profile types can be merged, e.g. clang frontend profiles
+    // should not be merged with memprof profiles.
+    if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) {
+      consumeError(std::move(E));
+      WC->Errors.emplace_back(
+          make_error<StringError>(
+              "Cannot merge MemProf profile with Clang generated profile.",
+              std::error_code()),
+          Filename);
+      return;
+    }
+
+    auto MemProfError = [&](Error E) {
+      auto [ErrorCode, Msg] = InstrProfError::take(std::move(E));
+      WC->Errors.emplace_back(make_error<InstrProfError>(ErrorCode, Msg),
+                              Filename);
+    };
+
+    // Add the frame mappings into the writer context.
+    const auto &IdToFrame = Reader->getFrameMapping();
+    for (const auto &I : IdToFrame) {
+      bool Succeeded = WC->Writer.addMemProfFrame(
+          /*Id=*/I.first, /*Frame=*/I.getSecond(), MemProfError);
+      // If we weren't able to add the frame mappings then it doesn't make
+      // sense to try to add the records from this profile.
+      if (!Succeeded)
+        return;
+    }
+
+    // Add the call stacks into the writer context.
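+    // (Each entry maps a call stack id to its frames, mirroring the frame
+    // mapping added above; the records added below refer to these ids.)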
+    const auto &CSIdToCallStacks = Reader->getCallStacks();
+    for (const auto &I : CSIdToCallStacks) {
+      bool Succeeded = WC->Writer.addMemProfCallStack(
+          /*Id=*/I.first, /*Frame=*/I.getSecond(), MemProfError);
+      // If we weren't able to add the call stacks then it doesn't make sense
+      // to try to add the records from this profile.
+      if (!Succeeded)
+        return;
+    }
+
+    const auto &FunctionProfileData = Reader->getProfileData();
+    // Add the memprof records into the writer context.
+    for (const auto &[GUID, Record] : FunctionProfileData) {
+      WC->Writer.addMemProfRecord(GUID, Record);
+    }
+    return;
+  }
+
+  auto FS = vfs::getRealFileSystem();
+  // TODO: This only saves the first non-fatal error from InstrProfReader and
+  // adds it to WriterContext::Errors. However, this is not extensible if we
+  // have more non-fatal errors from InstrProfReader in the future. How
+  // should this interact with different -failure-mode?
+  std::optional<std::pair<Error, std::string>> ReaderWarning;
+  auto Warn = [&](Error E) {
+    if (ReaderWarning) {
+      consumeError(std::move(E));
+      return;
+    }
+    // Only show the first time an error occurs in this file.
+    auto [ErrCode, Msg] = InstrProfError::take(std::move(E));
+    ReaderWarning = {make_error<InstrProfError>(ErrCode, Msg), Filename};
+  };
+  auto ReaderOrErr =
+      InstrProfReader::create(Input.Filename, *FS, Correlator, Warn);
+  if (Error E = ReaderOrErr.takeError()) {
+    // Skip the empty profiles by returning silently.
+    auto [ErrCode, Msg] = InstrProfError::take(std::move(E));
+    if (ErrCode != instrprof_error::empty_raw_profile)
+      WC->Errors.emplace_back(make_error<InstrProfError>(ErrCode, Msg),
+                              Filename);
+    return;
+  }
+
+  auto Reader = std::move(ReaderOrErr.get());
+  if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) {
+    consumeError(std::move(E));
+    WC->Errors.emplace_back(
+        make_error<StringError>(
+            "Merge IR generated profile with Clang generated profile.",
+            std::error_code()),
+        Filename);
+    return;
+  }
+
+  for (auto &I : *Reader) {
+    if (Remapper)
+      I.Name = (*Remapper)(I.Name);
+    const StringRef FuncName = I.Name;
+    bool Reported = false;
+    WC->Writer.addRecord(std::move(I), Input.Weight, [&](Error E) {
+      if (Reported) {
+        consumeError(std::move(E));
+        return;
+      }
+      Reported = true;
+      // Only show hint the first time an error occurs.
+      auto [ErrCode, Msg] = InstrProfError::take(std::move(E));
+      std::unique_lock<std::mutex> ErrGuard{WC->ErrLock};
+      bool firstTime = WC->WriterErrorCodes.insert(ErrCode).second;
+      handleMergeWriterError(make_error<InstrProfError>(ErrCode, Msg),
+                             Input.Filename, FuncName, firstTime);
+    });
+  }
+
+  if (KeepVTableSymbols) {
+    const InstrProfSymtab &symtab = Reader->getSymtab();
+    const auto &VTableNames = symtab.getVTableNames();
+
+    for (const auto &kv : VTableNames)
+      WC->Writer.addVTableName(kv.getKey());
+  }
+
+  if (Reader->hasTemporalProfile()) {
+    auto &Traces = Reader->getTemporalProfTraces(Input.Weight);
+    if (!Traces.empty())
+      WC->Writer.addTemporalProfileTraces(
+          Traces, Reader->getTemporalProfTraceStreamSize());
+  }
+  if (Reader->hasError()) {
+    if (Error E = Reader->getError()) {
+      WC->Errors.emplace_back(std::move(E), Filename);
+      return;
+    }
+  }
+
+  std::vector<llvm::object::BuildID> BinaryIds;
+  if (Error E = Reader->readBinaryIds(BinaryIds)) {
+    WC->Errors.emplace_back(std::move(E), Filename);
+    return;
+  }
+  WC->Writer.addBinaryIds(BinaryIds);
+
+  if (ReaderWarning) {
+    WC->Errors.emplace_back(std::move(ReaderWarning->first),
+                            ReaderWarning->second);
+  }
+}
+
+/// Merge the \p Src writer context into \p Dst.
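+/// Deferred errors collected in \p Src are moved into \p Dst so that they are
+/// reported only once, after all contexts have been combined.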
+static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) {
+  for (auto &ErrorPair : Src->Errors)
+    Dst->Errors.push_back(std::move(ErrorPair));
+  Src->Errors.clear();
+
+  if (Error E = Dst->Writer.mergeProfileKind(Src->Writer.getProfileKind()))
+    exitWithError(std::move(E));
+
+  Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer), [&](Error E) {
+    auto [ErrorCode, Msg] = InstrProfError::take(std::move(E));
+    std::unique_lock<std::mutex> ErrGuard{Dst->ErrLock};
+    bool firstTime = Dst->WriterErrorCodes.insert(ErrorCode).second;
+    if (firstTime)
+      warn(toString(make_error<InstrProfError>(ErrorCode, Msg)));
+  });
+}
+
+static StringRef
+getFuncName(const StringMap<InstrProfWriter::ProfilingData>::value_type &Val) {
+  return Val.first();
+}
+
+static std::string
+getFuncName(const SampleProfileMap::value_type &Val) {
+  return Val.second.getContext().toString();
+}
+
+template <class T>
+static void filterFunctions(T &ProfileMap) {
+  bool hasFilter = !FuncNameFilter.empty();
+  bool hasNegativeFilter = !FuncNameNegativeFilter.empty();
+  if (!hasFilter && !hasNegativeFilter)
+    return;
+
+  // If the filter starts with '?', it is an MSVC mangled name, not a regex.
+  llvm::Regex ProbablyMSVCMangledName("[?@$_0-9A-Za-z]+");
+  if (hasFilter && FuncNameFilter[0] == '?' &&
+      ProbablyMSVCMangledName.match(FuncNameFilter))
+    FuncNameFilter = llvm::Regex::escape(FuncNameFilter);
+  if (hasNegativeFilter && FuncNameNegativeFilter[0] == '?' &&
+      ProbablyMSVCMangledName.match(FuncNameNegativeFilter))
+    FuncNameNegativeFilter = llvm::Regex::escape(FuncNameNegativeFilter);
+
+  size_t Count = ProfileMap.size();
+  llvm::Regex Pattern(FuncNameFilter);
+  llvm::Regex NegativePattern(FuncNameNegativeFilter);
+  std::string Error;
+  if (hasFilter && !Pattern.isValid(Error))
+    exitWithError(Error);
+  if (hasNegativeFilter && !NegativePattern.isValid(Error))
+    exitWithError(Error);
+
+  // Handle MD5 profile, so it is still able to match using the original name.
+  std::string MD5Name = std::to_string(llvm::MD5Hash(FuncNameFilter));
+  std::string NegativeMD5Name =
+      std::to_string(llvm::MD5Hash(FuncNameNegativeFilter));
+
+  for (auto I = ProfileMap.begin(); I != ProfileMap.end();) {
+    auto Tmp = I++;
+    const auto &FuncName = getFuncName(*Tmp);
+    // Negative filter has higher precedence than positive filter.
+    if ((hasNegativeFilter &&
+         (NegativePattern.match(FuncName) ||
+          (FunctionSamples::UseMD5 && NegativeMD5Name == FuncName))) ||
+        (hasFilter && !(Pattern.match(FuncName) ||
+                        (FunctionSamples::UseMD5 && MD5Name == FuncName))))
+      ProfileMap.erase(Tmp);
+  }
+
+  llvm::dbgs() << Count - ProfileMap.size() << " of " << Count << " functions "
+               << "in the original profile are filtered.\n";
+}
+
+static void writeInstrProfile(StringRef OutputFilename,
+                              ProfileFormat OutputFormat,
+                              InstrProfWriter &Writer) {
+  std::error_code EC;
+  raw_fd_ostream Output(OutputFilename.data(), EC,
+                        OutputFormat == PF_Text ? sys::fs::OF_TextWithCRLF
+                                                : sys::fs::OF_None);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  if (OutputFormat == PF_Text) {
+    if (Error E = Writer.writeText(Output))
+      warn(std::move(E));
+  } else {
+    if (Output.is_displayed())
+      exitWithError("cannot write a non-text format profile to the terminal");
+    if (Error E = Writer.write(Output))
+      warn(std::move(E));
+  }
+}
+
+static void mergeInstrProfile(const WeightedFileVector &Inputs,
+                              SymbolRemapper *Remapper,
+                              int MaxDbgCorrelationWarnings,
+                              const StringRef ProfiledBinary) {
+  const uint64_t TraceReservoirSize = TemporalProfTraceReservoirSize.getValue();
+  const uint64_t MaxTraceLength = TemporalProfMaxTraceLength.getValue();
+  if (OutputFormat == PF_Compact_Binary)
+    exitWithError("Compact Binary is deprecated");
+  if (OutputFormat != PF_Binary && OutputFormat != PF_Ext_Binary &&
+      OutputFormat != PF_Text)
+    exitWithError("unknown format is specified");
+
+  // TODO: Maybe we should support correlation with a mixture of different
+  // correlation modes (with and without debug-info/object correlation).
+  if (!DebugInfoFilename.empty() && !BinaryFilename.empty())
+    exitWithError("Expected only one of -debug-info, -binary-file");
+  std::string CorrelateFilename;
+  ProfCorrelatorKind CorrelateKind = ProfCorrelatorKind::NONE;
+  if (!DebugInfoFilename.empty()) {
+    CorrelateFilename = DebugInfoFilename;
+    CorrelateKind = ProfCorrelatorKind::DEBUG_INFO;
+  } else if (!BinaryFilename.empty()) {
+    CorrelateFilename = BinaryFilename;
+    CorrelateKind = ProfCorrelatorKind::BINARY;
+  }
+
+  std::unique_ptr<InstrProfCorrelator> Correlator;
+  if (CorrelateKind != InstrProfCorrelator::NONE) {
+    if (auto Err = InstrProfCorrelator::get(CorrelateFilename, CorrelateKind)
+                       .moveInto(Correlator))
+      exitWithError(std::move(Err), CorrelateFilename);
+    if (auto Err = Correlator->correlateProfileData(MaxDbgCorrelationWarnings))
+      exitWithError(std::move(Err), CorrelateFilename);
+  }
+
+  std::mutex ErrorLock;
+  SmallSet<instrprof_error, 4> WriterErrorCodes;
+
+  // If NumThreads is not specified, auto-detect a good default.
+  if (NumThreads == 0)
+    NumThreads = std::min(hardware_concurrency().compute_thread_count(),
+                          unsigned((Inputs.size() + 1) / 2));
+
+  // Initialize the writer contexts.
+  SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
+  for (unsigned I = 0; I < NumThreads; ++I)
+    Contexts.emplace_back(std::make_unique<WriterContext>(
+        OutputSparse, ErrorLock, WriterErrorCodes, TraceReservoirSize,
+        MaxTraceLength));
+
+  if (NumThreads == 1) {
+    for (const auto &Input : Inputs)
+      loadInput(Input, Remapper, Correlator.get(), ProfiledBinary,
+                Contexts[0].get());
+  } else {
+    DefaultThreadPool Pool(hardware_concurrency(NumThreads));
+
+    // Load the inputs in parallel (N/NumThreads serial steps).
+    unsigned Ctx = 0;
+    for (const auto &Input : Inputs) {
+      Pool.async(loadInput, Input, Remapper, Correlator.get(), ProfiledBinary,
+                 Contexts[Ctx].get());
+      Ctx = (Ctx + 1) % NumThreads;
+    }
+    Pool.wait();
+
+    // Merge the writer contexts together (~ lg(NumThreads) serial steps).
+    unsigned Mid = Contexts.size() / 2;
+    unsigned End = Contexts.size();
+    assert(Mid > 0 && "Expected more than one context");
+    do {
+      for (unsigned I = 0; I < Mid; ++I)
+        Pool.async(mergeWriterContexts, Contexts[I].get(),
+                   Contexts[I + Mid].get());
+      Pool.wait();
+      if (End & 1) {
+        Pool.async(mergeWriterContexts, Contexts[0].get(),
+                   Contexts[End - 1].get());
+        Pool.wait();
+      }
+      End = Mid;
+      Mid /= 2;
+    } while (Mid > 0);
+  }
+
+  // Handle deferred errors encountered during merging. If the number of
+  // errors is equal to the number of inputs, the merge failed.
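+  // With the default -failure-mode=any a single invalid input is already
+  // fatal; -failure-mode=all tolerates partial failure, and
+  // -failure-mode=warn never exits here.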
+  unsigned NumErrors = 0;
+  for (std::unique_ptr<WriterContext> &WC : Contexts) {
+    for (auto &ErrorPair : WC->Errors) {
+      ++NumErrors;
+      warn(toString(std::move(ErrorPair.first)), ErrorPair.second);
+    }
+  }
+  if ((NumErrors == Inputs.size() && FailMode == failIfAllAreInvalid) ||
+      (NumErrors > 0 && FailMode == failIfAnyAreInvalid))
+    exitWithError("no profile can be merged");
+
+  filterFunctions(Contexts[0]->Writer.getProfileData());
+
+  writeInstrProfile(OutputFilename, OutputFormat, Contexts[0]->Writer);
+}
+
+/// The profile entry for a function in an instrumentation profile.
+struct InstrProfileEntry {
+  uint64_t MaxCount = 0;
+  uint64_t NumEdgeCounters = 0;
+  float ZeroCounterRatio = 0.0;
+  InstrProfRecord *ProfRecord;
+  InstrProfileEntry(InstrProfRecord *Record);
+  InstrProfileEntry() = default;
+};
+
+InstrProfileEntry::InstrProfileEntry(InstrProfRecord *Record) {
+  ProfRecord = Record;
+  uint64_t CntNum = Record->Counts.size();
+  uint64_t ZeroCntNum = 0;
+  for (size_t I = 0; I < CntNum; ++I) {
+    MaxCount = std::max(MaxCount, Record->Counts[I]);
+    ZeroCntNum += !Record->Counts[I];
+  }
+  ZeroCounterRatio = (float)ZeroCntNum / CntNum;
+  NumEdgeCounters = CntNum;
+}
+
+/// Either set all the counters in the instr profile entry \p IFE to -1 / -2
+/// in order to drop the profile, or scale up the counters in \p IFE to be
+/// above the hot / cold threshold. We use the ratio of zero counters in the
+/// profile of a function to decide whether the profile is helpful or harmful
+/// for performance, and to choose whether to scale it up or drop it.
+static void updateInstrProfileEntry(InstrProfileEntry &IFE, bool SetToHot,
+                                    uint64_t HotInstrThreshold,
+                                    uint64_t ColdInstrThreshold,
+                                    float ZeroCounterThreshold) {
+  InstrProfRecord *ProfRecord = IFE.ProfRecord;
+  if (!IFE.MaxCount || IFE.ZeroCounterRatio > ZeroCounterThreshold) {
+    // If all or most of the counters of the function are zero, the
+    // profile is unaccountable and should be dropped. Reset all the
+    // counters to be -1 / -2 and PGO profile-use will drop the profile.
+    // All counters being -1 also implies that the function is hot so
+    // PGO profile-use will also set the entry count metadata to be
+    // above hot threshold.
+    // All counters being -2 implies that the function is warm so
+    // PGO profile-use will also set the entry count metadata to be
+    // above cold threshold.
+    auto Kind =
+        (SetToHot ? InstrProfRecord::PseudoHot : InstrProfRecord::PseudoWarm);
+    ProfRecord->setPseudoCount(Kind);
+    return;
+  }
+
+  // Scale up the MaxCount to be multiple times above hot / cold threshold.
+  const unsigned MultiplyFactor = 3;
+  uint64_t Threshold = (SetToHot ? HotInstrThreshold : ColdInstrThreshold);
+  uint64_t Numerator = Threshold * MultiplyFactor;
+
+  // Make sure the Threshold for warm counters is below the HotInstrThreshold.
+  if (!SetToHot && Threshold >= HotInstrThreshold) {
+    Threshold = (HotInstrThreshold + ColdInstrThreshold) / 2;
+  }
+
+  uint64_t Denominator = IFE.MaxCount;
+  if (Numerator <= Denominator)
+    return;
+  ProfRecord->scale(Numerator, Denominator, [&](instrprof_error E) {
+    warn(toString(make_error<InstrProfError>(E)));
+  });
+}
+
+const uint64_t ColdPercentileIdx = 15;
+const uint64_t HotPercentileIdx = 11;
+
+using sampleprof::FSDiscriminatorPass;
+
+// Internal options to set FSDiscriminatorPass. Used in merge and show
+// commands.
+static cl::opt<FSDiscriminatorPass> FSDiscriminatorPassOption(
+    "fs-discriminator-pass", cl::init(PassLast), cl::Hidden,
+    cl::desc("Zero out the discriminator bits for the FS discriminator "
+             "pass beyond this value. The enum values are defined in "
+             "Support/Discriminator.h"),
+    cl::values(clEnumVal(Base, "Use base discriminators only"),
+               clEnumVal(Pass1, "Use base and pass 1 discriminators"),
+               clEnumVal(Pass2, "Use base and pass 1-2 discriminators"),
+               clEnumVal(Pass3, "Use base and pass 1-3 discriminators"),
+               clEnumVal(PassLast, "Use all discriminator bits (default)")));
+
+static unsigned getDiscriminatorMask() {
+  return getN1Bits(getFSPassBitEnd(FSDiscriminatorPassOption.getValue()));
+}
+
+/// Adjust the instr profile in \p WC based on the sample profile in
+/// \p Reader.
+static void
+adjustInstrProfile(std::unique_ptr<WriterContext> &WC,
+                   std::unique_ptr<sampleprof::SampleProfileReader> &Reader,
+                   unsigned SupplMinSizeThreshold, float ZeroCounterThreshold,
+                   unsigned InstrProfColdThreshold) {
+  // Map a function to its entry in the instr profile.
+  StringMap<InstrProfileEntry> InstrProfileMap;
+  StringMap<StringRef> StaticFuncMap;
+  InstrProfSummaryBuilder IPBuilder(ProfileSummaryBuilder::DefaultCutoffs);
+
+  auto checkSampleProfileHasFUnique = [&Reader]() {
+    for (const auto &PD : Reader->getProfiles()) {
+      auto &FContext = PD.second.getContext();
+      if (FContext.toString().find(FunctionSamples::UniqSuffix) !=
+          std::string::npos) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  bool SampleProfileHasFUnique = checkSampleProfileHasFUnique();
+
+  auto buildStaticFuncMap = [&StaticFuncMap,
+                             SampleProfileHasFUnique](const StringRef Name) {
+    std::string FilePrefixes[] = {".cpp", "cc", ".c", ".hpp", ".h"};
+    size_t PrefixPos = StringRef::npos;
+    for (auto &FilePrefix : FilePrefixes) {
+      std::string NamePrefix = FilePrefix + GlobalIdentifierDelimiter;
+      PrefixPos = Name.find_insensitive(NamePrefix);
+      if (PrefixPos == StringRef::npos)
+        continue;
+      PrefixPos += NamePrefix.size();
+      break;
+    }
+
+    if (PrefixPos == StringRef::npos) {
+      return;
+    }
+
+    StringRef NewName = Name.drop_front(PrefixPos);
+    StringRef FName = Name.substr(0, PrefixPos - 1);
+    if (NewName.size() == 0) {
+      return;
+    }
+
+    // This name should have a static linkage.
+    size_t PostfixPos = NewName.find(FunctionSamples::UniqSuffix);
+    bool ProfileHasFUnique = (PostfixPos != StringRef::npos);
+
+    // If sample profile and instrumented profile do not agree on symbol
+    // uniqification.
+    if (SampleProfileHasFUnique != ProfileHasFUnique) {
+      // If the instrumented profile uses -funique-internal-linkage-symbols,
+      // we need to trim the name.
+      if (ProfileHasFUnique) {
+        NewName = NewName.substr(0, PostfixPos);
+      } else {
+        // If the sample profile uses -funique-internal-linkage-symbols,
+        // we build the map.
+        std::string NStr =
+            NewName.str() + getUniqueInternalLinkagePostfix(FName);
+        NewName = StringRef(NStr);
+        StaticFuncMap[NewName] = Name;
+        return;
+      }
+    }
+
+    if (!StaticFuncMap.contains(NewName)) {
+      StaticFuncMap[NewName] = Name;
+    } else {
+      StaticFuncMap[NewName] = DuplicateNameStr;
+    }
+  };
+
+  // We need to flatten the SampleFDO profile as the InstrFDO
+  // profile does not have inlined callsite profiles.
+  // One caveat is the pre-inlined functions -- their samples
+  // should be collapsed into the caller function.
+  // Here we do a DFS traversal to get the flattened profile
+  // info: the sum of entrycount and the max of maxcount.
+  // Here is the algorithm:
+  //   recursive (FS, root_name) {
+  //     name = FS->getName();
+  //     get samples for FS;
+  //     if (InstrProf.find(name)) {
+  //       root_name = name;
+  //     } else {
+  //       if (name is in static_func map) {
+  //         root_name = static_name;
+  //       }
+  //     }
+  //     update the Map entry for root_name;
+  //     for (subfs : FS) {
+  //       recursive(subfs, root_name);
+  //     }
+  //   }
+  //
+  // Here is an example.
+  //
+  // SampleProfile:
+  // foo:12345:1000
+  //  1: 1000
+  //  2.1: 1000
+  //  15: 5000
+  //  4: bar:1000
+  //   1: 1000
+  //   2: goo:3000
+  //    1: 3000
+  //  8: bar:40000
+  //   1: 10000
+  //   2: goo:30000
+  //    1: 30000
+  //
+  // InstrProfile has two entries:
+  //  foo
+  //  bar.cc;bar
+  //
+  // After BuildMaxSampleMap, we should have the following in FlattenSampleMap:
+  // {"foo", {1000, 5000}}
+  // {"bar.cc;bar", {11000, 30000}}
+  //
+  // foo has an entry count of 1000, and a max body count of 5000.
+  // bar.cc;bar has an entry count of 11000 (the sum of the two callsites of
+  // 1000 and 10000), and a max count of 30000 (from the callsite in line 8).
+  //
+  // Note that goo's count will remain in bar.cc;bar() as it does not have an
+  // entry in InstrProfile.
+  llvm::StringMap<std::pair<uint64_t, uint64_t>> FlattenSampleMap;
+  auto BuildMaxSampleMap = [&FlattenSampleMap, &StaticFuncMap,
+                            &InstrProfileMap](const FunctionSamples &FS,
+                                              const StringRef &RootName) {
+    auto BuildMaxSampleMapImpl = [&](const FunctionSamples &FS,
+                                     const StringRef &RootName,
+                                     auto &BuildImpl) -> void {
+      std::string NameStr = FS.getFunction().str();
+      const StringRef Name = NameStr;
+      const StringRef *NewRootName = &RootName;
+      uint64_t EntrySample = FS.getHeadSamplesEstimate();
+      uint64_t MaxBodySample = FS.getMaxCountInside(/*SkipCallSite*/ true);
+
+      auto It = InstrProfileMap.find(Name);
+      if (It != InstrProfileMap.end()) {
+        NewRootName = &Name;
+      } else {
+        auto NewName = StaticFuncMap.find(Name);
+        if (NewName != StaticFuncMap.end()) {
+          It = InstrProfileMap.find(NewName->second.str());
+          if (NewName->second != DuplicateNameStr) {
+            NewRootName = &NewName->second;
+          }
+        } else {
+          // Here the EntrySample is of an inlined function, so we should not
+          // update the EntrySample in the map.
+          EntrySample = 0;
+        }
+      }
+      EntrySample += FlattenSampleMap[*NewRootName].first;
+      MaxBodySample =
+          std::max(FlattenSampleMap[*NewRootName].second, MaxBodySample);
+      FlattenSampleMap[*NewRootName] =
+          std::make_pair(EntrySample, MaxBodySample);
+
+      for (const auto &C : FS.getCallsiteSamples())
+        for (const auto &F : C.second)
+          BuildImpl(F.second, *NewRootName, BuildImpl);
+    };
+    BuildMaxSampleMapImpl(FS, RootName, BuildMaxSampleMapImpl);
+  };
+
+  for (auto &PD : WC->Writer.getProfileData()) {
+    // Populate IPBuilder.
+    for (const auto &PDV : PD.getValue()) {
+      InstrProfRecord Record = PDV.second;
+      IPBuilder.addRecord(Record);
+    }
+
+    // If a function has multiple entries in the instr profile, skip it.
+    if (PD.getValue().size() != 1)
+      continue;
+
+    // Initialize InstrProfileMap.
+    InstrProfRecord *R = &PD.getValue().begin()->second;
+    StringRef FullName = PD.getKey();
+    InstrProfileMap[FullName] = InstrProfileEntry(R);
+    buildStaticFuncMap(FullName);
+  }
+
+  for (auto &PD : Reader->getProfiles()) {
+    sampleprof::FunctionSamples &FS = PD.second;
+    std::string Name = FS.getFunction().str();
+    BuildMaxSampleMap(FS, Name);
+  }
+
+  ProfileSummary InstrPS = *IPBuilder.getSummary();
+  ProfileSummary SamplePS = Reader->getSummary();
+
+  // Compute cold thresholds for the instr profile and the sample profile.
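+  // Each threshold is the minimum count at the corresponding percentile
+  // cutoff (HotPercentileIdx / ColdPercentileIdx into DefaultCutoffs) of the
+  // respective profile summary; a nonzero -instr-prof-cold-threshold
+  // overrides the computed instr cold threshold.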
+  uint64_t HotSampleThreshold =
+      ProfileSummaryBuilder::getEntryForPercentile(
+          SamplePS.getDetailedSummary(),
+          ProfileSummaryBuilder::DefaultCutoffs[HotPercentileIdx])
+          .MinCount;
+  uint64_t ColdSampleThreshold =
+      ProfileSummaryBuilder::getEntryForPercentile(
+          SamplePS.getDetailedSummary(),
+          ProfileSummaryBuilder::DefaultCutoffs[ColdPercentileIdx])
+          .MinCount;
+  uint64_t HotInstrThreshold =
+      ProfileSummaryBuilder::getEntryForPercentile(
+          InstrPS.getDetailedSummary(),
+          ProfileSummaryBuilder::DefaultCutoffs[HotPercentileIdx])
+          .MinCount;
+  uint64_t ColdInstrThreshold =
+      InstrProfColdThreshold
+          ? InstrProfColdThreshold
+          : ProfileSummaryBuilder::getEntryForPercentile(
+                InstrPS.getDetailedSummary(),
+                ProfileSummaryBuilder::DefaultCutoffs[ColdPercentileIdx])
+                .MinCount;
+
+  // Find hot/warm functions in the sample profile which are cold in the
+  // instr profile, and adjust the profiles of those functions in the instr
+  // profile.
+  for (const auto &E : FlattenSampleMap) {
+    uint64_t SampleMaxCount = std::max(E.second.first, E.second.second);
+    if (SampleMaxCount < ColdSampleThreshold)
+      continue;
+    StringRef Name = E.first();
+    auto It = InstrProfileMap.find(Name);
+    if (It == InstrProfileMap.end()) {
+      auto NewName = StaticFuncMap.find(Name);
+      if (NewName != StaticFuncMap.end()) {
+        It = InstrProfileMap.find(NewName->second.str());
+        if (NewName->second == DuplicateNameStr) {
+          WithColor::warning()
+              << "Static function " << Name
+              << " has multiple promoted names, cannot adjust profile.\n";
+        }
+      }
+    }
+    if (It == InstrProfileMap.end() ||
+        It->second.MaxCount > ColdInstrThreshold ||
+        It->second.NumEdgeCounters < SupplMinSizeThreshold)
+      continue;
+    bool SetToHot = SampleMaxCount >= HotSampleThreshold;
+    updateInstrProfileEntry(It->second, SetToHot, HotInstrThreshold,
+                            ColdInstrThreshold, ZeroCounterThreshold);
+  }
+}
+
+/// The main function to supplement an instr profile with a sample profile.
+/// \p Inputs contains the instr profile. \p SampleFilename specifies the
+/// sample profile. \p OutputFilename specifies the output profile name.
+/// \p OutputFormat specifies the output profile format. \p OutputSparse
+/// specifies whether to generate a sparse profile. \p SupplMinSizeThreshold
+/// specifies the minimal size for the functions whose profile will be
+/// adjusted. \p ZeroCounterThreshold is the threshold to check whether
+/// a function contains too many zero counters and whether its profile
+/// should be dropped. \p InstrProfColdThreshold is the user-specified
+/// cold threshold which will override the cold threshold got from the
+/// instr profile summary.
+static void supplementInstrProfile(const WeightedFileVector &Inputs,
+                                   StringRef SampleFilename, bool OutputSparse,
+                                   unsigned SupplMinSizeThreshold,
+                                   float ZeroCounterThreshold,
+                                   unsigned InstrProfColdThreshold) {
+  if (OutputFilename == "-")
+    exitWithError("cannot write indexed profdata format to stdout");
+  if (Inputs.size() != 1)
+    exitWithError("expect one input to be an instr profile");
+  if (Inputs[0].Weight != 1)
+    exitWithError("expect instr profile doesn't have weight");
+
+  StringRef InstrFilename = Inputs[0].Filename;
+
+  // Read the sample profile.
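+  // The sample profile only steers the adjustment: functions that look hot
+  // here but cold in the instr profile get their instr counters scaled up or
+  // replaced with pseudo counts (see updateInstrProfileEntry above).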
+  LLVMContext Context;
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = sampleprof::SampleProfileReader::create(
+      SampleFilename.str(), Context, *FS, FSDiscriminatorPassOption);
+  if (std::error_code EC = ReaderOrErr.getError())
+    exitWithErrorCode(EC, SampleFilename);
+  auto Reader = std::move(ReaderOrErr.get());
+  if (std::error_code EC = Reader->read())
+    exitWithErrorCode(EC, SampleFilename);
+
+  // Read the instr profile.
+  std::mutex ErrorLock;
+  SmallSet<instrprof_error, 4> WriterErrorCodes;
+  auto WC = std::make_unique<WriterContext>(OutputSparse, ErrorLock,
+                                            WriterErrorCodes);
+  loadInput(Inputs[0], nullptr, nullptr, /*ProfiledBinary=*/"", WC.get());
+  if (WC->Errors.size() > 0)
+    exitWithError(std::move(WC->Errors[0].first), InstrFilename);
+
+  adjustInstrProfile(WC, Reader, SupplMinSizeThreshold, ZeroCounterThreshold,
+                     InstrProfColdThreshold);
+  writeInstrProfile(OutputFilename, OutputFormat, WC->Writer);
+}
+
+/// Make a copy of the given function samples with all symbol names remapped
+/// by the provided symbol remapper.
+static sampleprof::FunctionSamples
+remapSamples(const sampleprof::FunctionSamples &Samples,
+             SymbolRemapper &Remapper, sampleprof_error &Error) {
+  sampleprof::FunctionSamples Result;
+  Result.setFunction(Remapper(Samples.getFunction()));
+  Result.addTotalSamples(Samples.getTotalSamples());
+  Result.addHeadSamples(Samples.getHeadSamples());
+  for (const auto &BodySample : Samples.getBodySamples()) {
+    uint32_t MaskedDiscriminator =
+        BodySample.first.Discriminator & getDiscriminatorMask();
+    Result.addBodySamples(BodySample.first.LineOffset, MaskedDiscriminator,
+                          BodySample.second.getSamples());
+    for (const auto &Target : BodySample.second.getCallTargets()) {
+      Result.addCalledTargetSamples(BodySample.first.LineOffset,
+                                    MaskedDiscriminator,
+                                    Remapper(Target.first), Target.second);
+    }
+  }
+  for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
+    sampleprof::FunctionSamplesMap &Target =
+        Result.functionSamplesAt(CallsiteSamples.first);
+    for (const auto &Callsite : CallsiteSamples.second) {
+      sampleprof::FunctionSamples Remapped =
+          remapSamples(Callsite.second, Remapper, Error);
+      mergeSampleProfErrors(Error,
+                            Target[Remapped.getFunction()].merge(Remapped));
+    }
+  }
+  return Result;
+}
+
+static sampleprof::SampleProfileFormat FormatMap[] = {
+    sampleprof::SPF_None,
+    sampleprof::SPF_Text,
+    sampleprof::SPF_None,
+    sampleprof::SPF_Ext_Binary,
+    sampleprof::SPF_GCC,
+    sampleprof::SPF_Binary};
+
+static std::unique_ptr<MemoryBuffer>
+getInputFileBuf(const StringRef &InputFile) {
+  if (InputFile == "")
+    return {};
+
+  auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFile);
+  if (!BufOrError)
+    exitWithErrorCode(BufOrError.getError(), InputFile);
+
+  return std::move(*BufOrError);
+}
+
+static void populateProfileSymbolList(MemoryBuffer *Buffer,
+                                      sampleprof::ProfileSymbolList &PSL) {
+  if (!Buffer)
+    return;
+
+  SmallVector<StringRef, 32> SymbolVec;
+  StringRef Data = Buffer->getBuffer();
+  Data.split(SymbolVec, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+
+  for (StringRef SymbolStr : SymbolVec)
+    PSL.add(SymbolStr.trim());
+}
+
+static void handleExtBinaryWriter(sampleprof::SampleProfileWriter &Writer,
+                                  ProfileFormat OutputFormat,
+                                  MemoryBuffer *Buffer,
+                                  sampleprof::ProfileSymbolList &WriterList,
+                                  bool CompressAllSections, bool UseMD5,
+                                  bool GenPartialProfile) {
+  populateProfileSymbolList(Buffer, WriterList);
+  if (WriterList.size() > 0 && OutputFormat != PF_Ext_Binary)
+    warn("Profile symbol list is not empty but the output format is not "
+         "ExtBinary format. The list will be lost in the output. ");
+
+  Writer.setProfileSymbolList(&WriterList);
+
+  if (CompressAllSections) {
+    if (OutputFormat != PF_Ext_Binary)
+      warn("-compress-all-sections is ignored. Specify -extbinary to enable "
+           "it");
+    else
+      Writer.setToCompressAllSections();
+  }
+  if (UseMD5) {
+    if (OutputFormat != PF_Ext_Binary)
+      warn("-use-md5 is ignored. Specify -extbinary to enable it");
+    else
+      Writer.setUseMD5();
+  }
+  if (GenPartialProfile) {
+    if (OutputFormat != PF_Ext_Binary)
+      warn("-gen-partial-profile is ignored. Specify -extbinary to enable it");
+    else
+      Writer.setPartialProfile();
+  }
+}
+
+static void mergeSampleProfile(const WeightedFileVector &Inputs,
+                               SymbolRemapper *Remapper,
+                               StringRef ProfileSymbolListFile,
+                               size_t OutputSizeLimit) {
+  using namespace sampleprof;
+  SampleProfileMap ProfileMap;
+  SmallVector<std::unique_ptr<SampleProfileReader>, 5> Readers;
+  LLVMContext Context;
+  sampleprof::ProfileSymbolList WriterList;
+  std::optional<bool> ProfileIsProbeBased;
+  std::optional<bool> ProfileIsCS;
+  for (const auto &Input : Inputs) {
+    auto FS = vfs::getRealFileSystem();
+    auto ReaderOrErr = SampleProfileReader::create(Input.Filename, Context, *FS,
+                                                   FSDiscriminatorPassOption);
+    if (std::error_code EC = ReaderOrErr.getError()) {
+      warnOrExitGivenError(FailMode, EC, Input.Filename);
+      continue;
+    }
+
+    // We need to keep the readers around until after all the files are
+    // read so that we do not lose the function names stored in each
+    // reader's memory. The function names are needed to write out the
+    // merged profile map.
+    Readers.push_back(std::move(ReaderOrErr.get()));
+    const auto Reader = Readers.back().get();
+    if (std::error_code EC = Reader->read()) {
+      warnOrExitGivenError(FailMode, EC, Input.Filename);
+      Readers.pop_back();
+      continue;
+    }
+
+    SampleProfileMap &Profiles = Reader->getProfiles();
+    if (ProfileIsProbeBased &&
+        ProfileIsProbeBased != FunctionSamples::ProfileIsProbeBased)
+      exitWithError(
+          "cannot merge probe-based profile with non-probe-based profile");
+    ProfileIsProbeBased = FunctionSamples::ProfileIsProbeBased;
+    if (ProfileIsCS && ProfileIsCS != FunctionSamples::ProfileIsCS)
+      exitWithError("cannot merge CS profile with non-CS profile");
+    ProfileIsCS = FunctionSamples::ProfileIsCS;
+    for (SampleProfileMap::iterator I = Profiles.begin(), E = Profiles.end();
+         I != E; ++I) {
+      sampleprof_error Result = sampleprof_error::success;
+      FunctionSamples Remapped =
+          Remapper ? remapSamples(I->second, *Remapper, Result)
+                   : FunctionSamples();
+      FunctionSamples &Samples = Remapper ? Remapped : I->second;
+      SampleContext FContext = Samples.getContext();
+      mergeSampleProfErrors(Result,
+                            ProfileMap[FContext].merge(Samples, Input.Weight));
+      if (Result != sampleprof_error::success) {
+        std::error_code EC = make_error_code(Result);
+        handleMergeWriterError(errorCodeToError(EC), Input.Filename,
+                               FContext.toString());
+      }
+    }
+
+    if (!DropProfileSymbolList) {
+      std::unique_ptr<sampleprof::ProfileSymbolList> ReaderList =
+          Reader->getProfileSymbolList();
+      if (ReaderList)
+        WriterList.merge(*ReaderList);
+    }
+  }
+
+  if (ProfileIsCS && (SampleMergeColdContext || SampleTrimColdContext)) {
+    // Use the threshold calculated from the profile summary unless specified.
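+    // Contexts whose sample count falls below this threshold are then
+    // trimmed and/or merged into the base profile, per the
+    // -sample-trim-cold-context / -sample-merge-cold-context flags.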
+    SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
+    auto Summary = Builder.computeSummaryForProfiles(ProfileMap);
+    uint64_t SampleProfColdThreshold =
+        ProfileSummaryBuilder::getColdCountThreshold(
+            (Summary->getDetailedSummary()));
+
+    // Trim and merge cold context profiles using the cold threshold above.
+    SampleContextTrimmer(ProfileMap)
+        .trimAndMergeColdContextProfiles(
+            SampleProfColdThreshold, SampleTrimColdContext,
+            SampleMergeColdContext, SampleColdContextFrameDepth, false);
+  }
+
+  if (ProfileLayout == llvm::sampleprof::SPL_Flat) {
+    ProfileConverter::flattenProfile(ProfileMap, FunctionSamples::ProfileIsCS);
+    ProfileIsCS = FunctionSamples::ProfileIsCS = false;
+  } else if (ProfileIsCS && ProfileLayout == llvm::sampleprof::SPL_Nest) {
+    ProfileConverter CSConverter(ProfileMap);
+    CSConverter.convertCSProfiles();
+    ProfileIsCS = FunctionSamples::ProfileIsCS = false;
+  }
+
+  filterFunctions(ProfileMap);
+
+  auto WriterOrErr =
+      SampleProfileWriter::create(OutputFilename, FormatMap[OutputFormat]);
+  if (std::error_code EC = WriterOrErr.getError())
+    exitWithErrorCode(EC, OutputFilename);
+
+  auto Writer = std::move(WriterOrErr.get());
+  // WriterList will have StringRefs referring to strings in Buffer.
+  // Make sure Buffer lives as long as WriterList.
+  auto Buffer = getInputFileBuf(ProfileSymbolListFile);
+  handleExtBinaryWriter(*Writer, OutputFormat, Buffer.get(), WriterList,
+                        CompressAllSections, UseMD5, GenPartialProfile);
+
+  // If OutputSizeLimit is 0 (default), it is the same as write().
+  if (std::error_code EC =
+          Writer->writeWithSizeLimit(ProfileMap, OutputSizeLimit))
+    exitWithErrorCode(EC);
+}
+
+static WeightedFile parseWeightedFile(const StringRef &WeightedFilename) {
+  StringRef WeightStr, FileName;
+  std::tie(WeightStr, FileName) = WeightedFilename.split(',');
+
+  uint64_t Weight;
+  if (WeightStr.getAsInteger(10, Weight) || Weight < 1)
+    exitWithError("input weight must be a positive integer");
+
+  return {std::string(FileName), Weight};
+}
+
+static void addWeightedInput(WeightedFileVector &WNI, const WeightedFile &WF) {
+  StringRef Filename = WF.Filename;
+  uint64_t Weight = WF.Weight;
+
+  // If it's STDIN just pass it on.
+  if (Filename == "-") {
+    WNI.push_back({std::string(Filename), Weight});
+    return;
+  }
+
+  llvm::sys::fs::file_status Status;
+  llvm::sys::fs::status(Filename, Status);
+  if (!llvm::sys::fs::exists(Status))
+    exitWithErrorCode(make_error_code(errc::no_such_file_or_directory),
+                      Filename);
+  // If it's a source file, collect it.
+  if (llvm::sys::fs::is_regular_file(Status)) {
+    WNI.push_back({std::string(Filename), Weight});
+    return;
+  }
+
+  if (llvm::sys::fs::is_directory(Status)) {
+    std::error_code EC;
+    for (llvm::sys::fs::recursive_directory_iterator F(Filename, EC), E;
+         F != E && !EC; F.increment(EC)) {
+      if (llvm::sys::fs::is_regular_file(F->path())) {
+        addWeightedInput(WNI, {F->path(), Weight});
+      }
+    }
+    if (EC)
+      exitWithErrorCode(EC, Filename);
+  }
+}
+
+static void parseInputFilenamesFile(MemoryBuffer *Buffer,
+                                    WeightedFileVector &WFV) {
+  if (!Buffer)
+    return;
+
+  SmallVector<StringRef, 8> Entries;
+  StringRef Data = Buffer->getBuffer();
+  Data.split(Entries, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+  for (const StringRef &FileWeightEntry : Entries) {
+    StringRef SanitizedEntry = FileWeightEntry.trim(" \t\v\f\r");
+    // Skip comments.
+    if (SanitizedEntry.starts_with("#"))
+      continue;
+    // If there's no comma, it's an unweighted profile.
+    else if (!SanitizedEntry.contains(','))
+      addWeightedInput(WFV, {std::string(SanitizedEntry), 1});
+    else
+      addWeightedInput(WFV, parseWeightedFile(SanitizedEntry));
+  }
+}
+
+static int merge_main(StringRef ProgName) {
+  WeightedFileVector WeightedInputs;
+  for (StringRef Filename : InputFilenames)
+    addWeightedInput(WeightedInputs, {std::string(Filename), 1});
+  for (StringRef WeightedFilename : WeightedInputFilenames)
+    addWeightedInput(WeightedInputs, parseWeightedFile(WeightedFilename));
+
+  // Make sure that the file buffer stays alive for the duration of the
+  // weighted input vector's lifetime.
+  auto Buffer = getInputFileBuf(InputFilenamesFile);
+  parseInputFilenamesFile(Buffer.get(), WeightedInputs);
+
+  if (WeightedInputs.empty())
+    exitWithError("no input files specified. See " + ProgName + " merge -help");
+
+  if (DumpInputFileList) {
+    for (auto &WF : WeightedInputs)
+      outs() << WF.Weight << "," << WF.Filename << "\n";
+    return 0;
+  }
+
+  std::unique_ptr<SymbolRemapper> Remapper;
+  if (!RemappingFile.empty())
+    Remapper = SymbolRemapper::create(RemappingFile);
+
+  if (!SupplInstrWithSample.empty()) {
+    if (ProfileKind != instr)
+      exitWithError(
+          "-supplement-instr-with-sample can only work with -instr. ");
+
+    supplementInstrProfile(WeightedInputs, SupplInstrWithSample, OutputSparse,
+                           SupplMinSizeThreshold, ZeroCounterThreshold,
+                           InstrProfColdThreshold);
+    return 0;
+  }
+
+  if (ProfileKind == instr)
+    mergeInstrProfile(WeightedInputs, Remapper.get(), MaxDbgCorrelationWarnings,
+                      ProfiledBinary);
+  else
+    mergeSampleProfile(WeightedInputs, Remapper.get(), ProfileSymbolListFile,
+                       OutputSizeLimit);
+  return 0;
+}
+
+/// Compute the overlap between profile BaseFilename and profile TestFilename.
+static void overlapInstrProfile(const std::string &BaseFilename,
+                                const std::string &TestFilename,
+                                const OverlapFuncFilters &FuncFilter,
+                                raw_fd_ostream &OS, bool IsCS) {
+  std::mutex ErrorLock;
+  SmallSet<instrprof_error, 4> WriterErrorCodes;
+  WriterContext Context(false, ErrorLock, WriterErrorCodes);
+  WeightedFile WeightedInput{BaseFilename, 1};
+  OverlapStats Overlap;
+  Error E = Overlap.accumulateCounts(BaseFilename, TestFilename, IsCS);
+  if (E)
+    exitWithError(std::move(E), "error in getting profile count sums");
+  if (Overlap.Base.CountSum < 1.0f) {
+    OS << "Sum of edge counts for profile " << BaseFilename << " is 0.\n";
+    exit(0);
+  }
+  if (Overlap.Test.CountSum < 1.0f) {
+    OS << "Sum of edge counts for profile " << TestFilename << " is 0.\n";
+    exit(0);
+  }
+  loadInput(WeightedInput, nullptr, nullptr, /*ProfiledBinary=*/"", &Context);
+  overlapInput(BaseFilename, TestFilename, &Context, Overlap, FuncFilter, OS,
+               IsCS);
+  Overlap.dump(OS);
+}
+
+namespace {
+struct SampleOverlapStats {
+  SampleContext BaseName;
+  SampleContext TestName;
+  // Number of overlap units
+  uint64_t OverlapCount = 0;
+  // Total samples of overlap units
+  uint64_t OverlapSample = 0;
+  // Number of and total samples of units that are only present in the base or
+  // test profile
+  uint64_t BaseUniqueCount = 0;
+  uint64_t BaseUniqueSample = 0;
+  uint64_t TestUniqueCount = 0;
+  uint64_t TestUniqueSample = 0;
+  // Number of units and total samples in the base or test profile
+  uint64_t BaseCount = 0;
+  uint64_t BaseSample = 0;
+  uint64_t TestCount = 0;
+  uint64_t TestSample = 0;
+  // Number of and total samples of units present in at least one profile
+  uint64_t UnionCount = 0;
+  uint64_t UnionSample = 0;
+  // Weighted similarity
+  double Similarity = 0.0;
+  // For SampleOverlapStats instances representing functions, weights of the
+  //
function in base and test profiles + double BaseWeight = 0.0; + double TestWeight = 0.0; + + SampleOverlapStats() = default; +}; +} // end anonymous namespace + +namespace { +struct FuncSampleStats { + uint64_t SampleSum = 0; + uint64_t MaxSample = 0; + uint64_t HotBlockCount = 0; + FuncSampleStats() = default; + FuncSampleStats(uint64_t SampleSum, uint64_t MaxSample, + uint64_t HotBlockCount) + : SampleSum(SampleSum), MaxSample(MaxSample), + HotBlockCount(HotBlockCount) {} +}; +} // end anonymous namespace + +namespace { +enum MatchStatus { MS_Match, MS_FirstUnique, MS_SecondUnique, MS_None }; + +// Class for updating merging steps for two sorted maps. The class should be +// instantiated with a map iterator type. +template class MatchStep { +public: + MatchStep() = delete; + + MatchStep(T FirstIter, T FirstEnd, T SecondIter, T SecondEnd) + : FirstIter(FirstIter), FirstEnd(FirstEnd), SecondIter(SecondIter), + SecondEnd(SecondEnd), Status(MS_None) {} + + bool areBothFinished() const { + return (FirstIter == FirstEnd && SecondIter == SecondEnd); + } + + bool isFirstFinished() const { return FirstIter == FirstEnd; } + + bool isSecondFinished() const { return SecondIter == SecondEnd; } + + /// Advance one step based on the previous match status unless the previous + /// status is MS_None. Then update Status based on the comparison between two + /// container iterators at the current step. If the previous status is + /// MS_None, it means two iterators are at the beginning and no comparison has + /// been made, so we simply update Status without advancing the iterators. + void updateOneStep(); + + T getFirstIter() const { return FirstIter; } + + T getSecondIter() const { return SecondIter; } + + MatchStatus getMatchStatus() const { return Status; } + +private: + // Current iterator and end iterator of the first container. + T FirstIter; + T FirstEnd; + // Current iterator and end iterator of the second container. + T SecondIter; + T SecondEnd; + // Match status of the current step. + MatchStatus Status; +}; +} // end anonymous namespace + +template void MatchStep::updateOneStep() { + switch (Status) { + case MS_Match: + ++FirstIter; + ++SecondIter; + break; + case MS_FirstUnique: + ++FirstIter; + break; + case MS_SecondUnique: + ++SecondIter; + break; + case MS_None: + break; + } + + // Update Status according to iterators at the current step. + if (areBothFinished()) + return; + if (FirstIter != FirstEnd && + (SecondIter == SecondEnd || FirstIter->first < SecondIter->first)) + Status = MS_FirstUnique; + else if (SecondIter != SecondEnd && + (FirstIter == FirstEnd || SecondIter->first < FirstIter->first)) + Status = MS_SecondUnique; + else + Status = MS_Match; +} + +// Return the sum of line/block samples, the max line/block sample, and the +// number of line/block samples above the given threshold in a function +// including its inlinees. +static void getFuncSampleStats(const sampleprof::FunctionSamples &Func, + FuncSampleStats &FuncStats, + uint64_t HotThreshold) { + for (const auto &L : Func.getBodySamples()) { + uint64_t Sample = L.second.getSamples(); + FuncStats.SampleSum += Sample; + FuncStats.MaxSample = std::max(FuncStats.MaxSample, Sample); + if (Sample >= HotThreshold) + ++FuncStats.HotBlockCount; + } + + for (const auto &C : Func.getCallsiteSamples()) { + for (const auto &F : C.second) + getFuncSampleStats(F.second, FuncStats, HotThreshold); + } +} + +/// Predicate that determines if a function is hot with a given threshold. 
We +/// keep it separate from its callsites for possible extension in the future. +static bool isFunctionHot(const FuncSampleStats &FuncStats, + uint64_t HotThreshold) { + // We intentionally compare the maximum sample count in a function with the + // HotThreshold to get an approximate determination on hot functions. + return (FuncStats.MaxSample >= HotThreshold); +} + +namespace { +class SampleOverlapAggregator { +public: + SampleOverlapAggregator(const std::string &BaseFilename, + const std::string &TestFilename, + double LowSimilarityThreshold, double Epsilon, + const OverlapFuncFilters &FuncFilter) + : BaseFilename(BaseFilename), TestFilename(TestFilename), + LowSimilarityThreshold(LowSimilarityThreshold), Epsilon(Epsilon), + FuncFilter(FuncFilter) {} + + /// Detect 0-sample input profile and report to output stream. This interface + /// should be called after loadProfiles(). + bool detectZeroSampleProfile(raw_fd_ostream &OS) const; + + /// Write out function-level similarity statistics for functions specified by + /// options --function, --value-cutoff, and --similarity-cutoff. + void dumpFuncSimilarity(raw_fd_ostream &OS) const; + + /// Write out program-level similarity and overlap statistics. + void dumpProgramSummary(raw_fd_ostream &OS) const; + + /// Write out hot-function and hot-block statistics for base_profile, + /// test_profile, and their overlap. For both cases, the overlap HO is + /// calculated as follows: + /// Given the number of functions (or blocks) that are hot in both profiles + /// HCommon and the number of functions (or blocks) that are hot in at + /// least one profile HUnion, HO = HCommon / HUnion. + void dumpHotFuncAndBlockOverlap(raw_fd_ostream &OS) const; + + /// This function tries matching functions in base and test profiles. For each + /// pair of matched functions, it aggregates the function-level + /// similarity into a profile-level similarity. It also dump function-level + /// similarity information of functions specified by --function, + /// --value-cutoff, and --similarity-cutoff options. The program-level + /// similarity PS is computed as follows: + /// Given function-level similarity FS(A) for all function A, the + /// weight of function A in base profile WB(A), and the weight of function + /// A in test profile WT(A), compute PS(base_profile, test_profile) = + /// sum_A(FS(A) * avg(WB(A), WT(A))) ranging in [0.0f to 1.0f] with 0.0 + /// meaning no-overlap. + void computeSampleProfileOverlap(raw_fd_ostream &OS); + + /// Initialize ProfOverlap with the sum of samples in base and test + /// profiles. This function also computes and keeps the sum of samples and + /// max sample counts of each function in BaseStats and TestStats for later + /// use to avoid re-computations. + void initializeSampleProfileOverlap(); + + /// Load profiles specified by BaseFilename and TestFilename. + std::error_code loadProfiles(); + + using FuncSampleStatsMap = + std::unordered_map; + +private: + SampleOverlapStats ProfOverlap; + SampleOverlapStats HotFuncOverlap; + SampleOverlapStats HotBlockOverlap; + std::string BaseFilename; + std::string TestFilename; + std::unique_ptr BaseReader; + std::unique_ptr TestReader; + // BaseStats and TestStats hold FuncSampleStats for each function, with + // function name as the key. 
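The hot overlap HO = HCommon / HUnion documented for dumpHotFuncAndBlockOverlap() reduces to simple set arithmetic. Below is a standalone sketch (not part of the upstream file), with std::set<std::string> standing in for the FuncSampleStatsMap keyed on SampleContext.

#include <cstddef>
#include <iostream>
#include <set>
#include <string>

static double hotOverlap(const std::set<std::string> &BaseHot,
                         const std::set<std::string> &TestHot) {
  std::size_t Common = 0;
  for (const std::string &F : BaseHot)
    Common += TestHot.count(F);
  // |Union| = |Base| + |Test| - |Common|.
  std::size_t Union = BaseHot.size() + TestHot.size() - Common;
  return Union ? static_cast<double>(Common) / Union : 0.0;
}

int main() {
  std::set<std::string> Base = {"main", "foo", "bar"};
  std::set<std::string> Test = {"main", "foo", "baz"};
  std::cout << hotOverlap(Base, Test) << "\n"; // 2 common / 4 union = 0.5
}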
+ FuncSampleStatsMap BaseStats; + FuncSampleStatsMap TestStats; + // Low similarity threshold in floating point number + double LowSimilarityThreshold; + // Block samples above BaseHotThreshold or TestHotThreshold are considered hot + // for tracking hot blocks. + uint64_t BaseHotThreshold; + uint64_t TestHotThreshold; + // A small threshold used to round the results of floating point accumulations + // to resolve imprecision. + const double Epsilon; + std::multimap> + FuncSimilarityDump; + // FuncFilter carries specifications in options --value-cutoff and + // --function. + OverlapFuncFilters FuncFilter; + // Column offsets for printing the function-level details table. + static const unsigned int TestWeightCol = 15; + static const unsigned int SimilarityCol = 30; + static const unsigned int OverlapCol = 43; + static const unsigned int BaseUniqueCol = 53; + static const unsigned int TestUniqueCol = 67; + static const unsigned int BaseSampleCol = 81; + static const unsigned int TestSampleCol = 96; + static const unsigned int FuncNameCol = 111; + + /// Return a similarity of two line/block sample counters in the same + /// function in base and test profiles. The line/block-similarity BS(i) is + /// computed as follows: + /// For an offsets i, given the sample count at i in base profile BB(i), + /// the sample count at i in test profile BT(i), the sum of sample counts + /// in this function in base profile SB, and the sum of sample counts in + /// this function in test profile ST, compute BS(i) = 1.0 - fabs(BB(i)/SB - + /// BT(i)/ST), ranging in [0.0f to 1.0f] with 0.0 meaning no-overlap. + double computeBlockSimilarity(uint64_t BaseSample, uint64_t TestSample, + const SampleOverlapStats &FuncOverlap) const; + + void updateHotBlockOverlap(uint64_t BaseSample, uint64_t TestSample, + uint64_t HotBlockCount); + + void getHotFunctions(const FuncSampleStatsMap &ProfStats, + FuncSampleStatsMap &HotFunc, + uint64_t HotThreshold) const; + + void computeHotFuncOverlap(); + + /// This function updates statistics in FuncOverlap, HotBlockOverlap, and + /// Difference for two sample units in a matched function according to the + /// given match status. + void updateOverlapStatsForFunction(uint64_t BaseSample, uint64_t TestSample, + uint64_t HotBlockCount, + SampleOverlapStats &FuncOverlap, + double &Difference, MatchStatus Status); + + /// This function updates statistics in FuncOverlap, HotBlockOverlap, and + /// Difference for unmatched callees that only present in one profile in a + /// matched caller function. + void updateForUnmatchedCallee(const sampleprof::FunctionSamples &Func, + SampleOverlapStats &FuncOverlap, + double &Difference, MatchStatus Status); + + /// This function updates sample overlap statistics of an overlap function in + /// base and test profile. It also calculates a function-internal similarity + /// FIS as follows: + /// For offsets i that have samples in at least one profile in this + /// function A, given BS(i) returned by computeBlockSimilarity(), compute + /// FIS(A) = (2.0 - sum_i(1.0 - BS(i))) / 2, ranging in [0.0f to 1.0f] with + /// 0.0 meaning no overlap. + double computeSampleFunctionInternalOverlap( + const sampleprof::FunctionSamples &BaseFunc, + const sampleprof::FunctionSamples &TestFunc, + SampleOverlapStats &FuncOverlap); + + /// Function-level similarity (FS) is a weighted value over function internal + /// similarity (FIS). This function computes a function's FS from its FIS by + /// applying the weight. 
+ double weightForFuncSimilarity(double FuncSimilarity, uint64_t BaseFuncSample, + uint64_t TestFuncSample) const; + + /// The function-level similarity FS(A) for a function A is computed as + /// follows: + /// Compute a function-internal similarity FIS(A) by + /// computeSampleFunctionInternalOverlap(). Then, with the weight of + /// function A in base profile WB(A), and the weight of function A in test + /// profile WT(A), compute FS(A) = FIS(A) * (1.0 - fabs(WB(A) - WT(A))) + /// ranging in [0.0f to 1.0f] with 0.0 meaning no overlap. + double + computeSampleFunctionOverlap(const sampleprof::FunctionSamples *BaseFunc, + const sampleprof::FunctionSamples *TestFunc, + SampleOverlapStats *FuncOverlap, + uint64_t BaseFuncSample, + uint64_t TestFuncSample); + + /// Profile-level similarity (PS) is a weighted aggregate over function-level + /// similarities (FS). This method weights the FS value by the function + /// weights in the base and test profiles for the aggregation. + double weightByImportance(double FuncSimilarity, uint64_t BaseFuncSample, + uint64_t TestFuncSample) const; +}; +} // end anonymous namespace + +bool SampleOverlapAggregator::detectZeroSampleProfile( + raw_fd_ostream &OS) const { + bool HaveZeroSample = false; + if (ProfOverlap.BaseSample == 0) { + OS << "Sum of sample counts for profile " << BaseFilename << " is 0.\n"; + HaveZeroSample = true; + } + if (ProfOverlap.TestSample == 0) { + OS << "Sum of sample counts for profile " << TestFilename << " is 0.\n"; + HaveZeroSample = true; + } + return HaveZeroSample; +} + +double SampleOverlapAggregator::computeBlockSimilarity( + uint64_t BaseSample, uint64_t TestSample, + const SampleOverlapStats &FuncOverlap) const { + double BaseFrac = 0.0; + double TestFrac = 0.0; + if (FuncOverlap.BaseSample > 0) + BaseFrac = static_cast(BaseSample) / FuncOverlap.BaseSample; + if (FuncOverlap.TestSample > 0) + TestFrac = static_cast(TestSample) / FuncOverlap.TestSample; + return 1.0 - std::fabs(BaseFrac - TestFrac); +} + +void SampleOverlapAggregator::updateHotBlockOverlap(uint64_t BaseSample, + uint64_t TestSample, + uint64_t HotBlockCount) { + bool IsBaseHot = (BaseSample >= BaseHotThreshold); + bool IsTestHot = (TestSample >= TestHotThreshold); + if (!IsBaseHot && !IsTestHot) + return; + + HotBlockOverlap.UnionCount += HotBlockCount; + if (IsBaseHot) + HotBlockOverlap.BaseCount += HotBlockCount; + if (IsTestHot) + HotBlockOverlap.TestCount += HotBlockCount; + if (IsBaseHot && IsTestHot) + HotBlockOverlap.OverlapCount += HotBlockCount; +} + +void SampleOverlapAggregator::getHotFunctions( + const FuncSampleStatsMap &ProfStats, FuncSampleStatsMap &HotFunc, + uint64_t HotThreshold) const { + for (const auto &F : ProfStats) { + if (isFunctionHot(F.second, HotThreshold)) + HotFunc.emplace(F.first, F.second); + } +} + +void SampleOverlapAggregator::computeHotFuncOverlap() { + FuncSampleStatsMap BaseHotFunc; + getHotFunctions(BaseStats, BaseHotFunc, BaseHotThreshold); + HotFuncOverlap.BaseCount = BaseHotFunc.size(); + + FuncSampleStatsMap TestHotFunc; + getHotFunctions(TestStats, TestHotFunc, TestHotThreshold); + HotFuncOverlap.TestCount = TestHotFunc.size(); + HotFuncOverlap.UnionCount = HotFuncOverlap.TestCount; + + for (const auto &F : BaseHotFunc) { + if (TestHotFunc.count(F.first)) + ++HotFuncOverlap.OverlapCount; + else + ++HotFuncOverlap.UnionCount; + } +} + +void SampleOverlapAggregator::updateOverlapStatsForFunction( + uint64_t BaseSample, uint64_t TestSample, uint64_t HotBlockCount, + SampleOverlapStats &FuncOverlap, double 
&Difference, MatchStatus Status) { + assert(Status != MS_None && + "Match status should be updated before updating overlap statistics"); + if (Status == MS_FirstUnique) { + TestSample = 0; + FuncOverlap.BaseUniqueSample += BaseSample; + } else if (Status == MS_SecondUnique) { + BaseSample = 0; + FuncOverlap.TestUniqueSample += TestSample; + } else { + ++FuncOverlap.OverlapCount; + } + + FuncOverlap.UnionSample += std::max(BaseSample, TestSample); + FuncOverlap.OverlapSample += std::min(BaseSample, TestSample); + Difference += + 1.0 - computeBlockSimilarity(BaseSample, TestSample, FuncOverlap); + updateHotBlockOverlap(BaseSample, TestSample, HotBlockCount); +} + +void SampleOverlapAggregator::updateForUnmatchedCallee( + const sampleprof::FunctionSamples &Func, SampleOverlapStats &FuncOverlap, + double &Difference, MatchStatus Status) { + assert((Status == MS_FirstUnique || Status == MS_SecondUnique) && + "Status must be either of the two unmatched cases"); + FuncSampleStats FuncStats; + if (Status == MS_FirstUnique) { + getFuncSampleStats(Func, FuncStats, BaseHotThreshold); + updateOverlapStatsForFunction(FuncStats.SampleSum, 0, + FuncStats.HotBlockCount, FuncOverlap, + Difference, Status); + } else { + getFuncSampleStats(Func, FuncStats, TestHotThreshold); + updateOverlapStatsForFunction(0, FuncStats.SampleSum, + FuncStats.HotBlockCount, FuncOverlap, + Difference, Status); + } +} + +double SampleOverlapAggregator::computeSampleFunctionInternalOverlap( + const sampleprof::FunctionSamples &BaseFunc, + const sampleprof::FunctionSamples &TestFunc, + SampleOverlapStats &FuncOverlap) { + + using namespace sampleprof; + + double Difference = 0; + + // Accumulate Difference for regular line/block samples in the function. + // We match them through sort-merge join algorithm because + // FunctionSamples::getBodySamples() returns a map of sample counters ordered + // by their offsets. + MatchStep BlockIterStep( + BaseFunc.getBodySamples().cbegin(), BaseFunc.getBodySamples().cend(), + TestFunc.getBodySamples().cbegin(), TestFunc.getBodySamples().cend()); + BlockIterStep.updateOneStep(); + while (!BlockIterStep.areBothFinished()) { + uint64_t BaseSample = + BlockIterStep.isFirstFinished() + ? 0 + : BlockIterStep.getFirstIter()->second.getSamples(); + uint64_t TestSample = + BlockIterStep.isSecondFinished() + ? 0 + : BlockIterStep.getSecondIter()->second.getSamples(); + updateOverlapStatsForFunction(BaseSample, TestSample, 1, FuncOverlap, + Difference, BlockIterStep.getMatchStatus()); + + BlockIterStep.updateOneStep(); + } + + // Accumulate Difference for callsite lines in the function. We match + // them through sort-merge algorithm because + // FunctionSamples::getCallsiteSamples() returns a map of callsite records + // ordered by their offsets. + MatchStep CallsiteIterStep( + BaseFunc.getCallsiteSamples().cbegin(), + BaseFunc.getCallsiteSamples().cend(), + TestFunc.getCallsiteSamples().cbegin(), + TestFunc.getCallsiteSamples().cend()); + CallsiteIterStep.updateOneStep(); + while (!CallsiteIterStep.areBothFinished()) { + MatchStatus CallsiteStepStatus = CallsiteIterStep.getMatchStatus(); + assert(CallsiteStepStatus != MS_None && + "Match status should be updated before entering loop body"); + + if (CallsiteStepStatus != MS_Match) { + auto Callsite = (CallsiteStepStatus == MS_FirstUnique) + ? 
CallsiteIterStep.getFirstIter() + : CallsiteIterStep.getSecondIter(); + for (const auto &F : Callsite->second) + updateForUnmatchedCallee(F.second, FuncOverlap, Difference, + CallsiteStepStatus); + } else { + // There may be multiple inlinees at the same offset, so we need to try + // matching all of them. This match is implemented through sort-merge + // algorithm because callsite records at the same offset are ordered by + // function names. + MatchStep CalleeIterStep( + CallsiteIterStep.getFirstIter()->second.cbegin(), + CallsiteIterStep.getFirstIter()->second.cend(), + CallsiteIterStep.getSecondIter()->second.cbegin(), + CallsiteIterStep.getSecondIter()->second.cend()); + CalleeIterStep.updateOneStep(); + while (!CalleeIterStep.areBothFinished()) { + MatchStatus CalleeStepStatus = CalleeIterStep.getMatchStatus(); + if (CalleeStepStatus != MS_Match) { + auto Callee = (CalleeStepStatus == MS_FirstUnique) + ? CalleeIterStep.getFirstIter() + : CalleeIterStep.getSecondIter(); + updateForUnmatchedCallee(Callee->second, FuncOverlap, Difference, + CalleeStepStatus); + } else { + // An inlined function can contain other inlinees inside, so compute + // the Difference recursively. + Difference += 2.0 - 2 * computeSampleFunctionInternalOverlap( + CalleeIterStep.getFirstIter()->second, + CalleeIterStep.getSecondIter()->second, + FuncOverlap); + } + CalleeIterStep.updateOneStep(); + } + } + CallsiteIterStep.updateOneStep(); + } + + // Difference reflects the total differences of line/block samples in this + // function and ranges in [0.0f to 2.0f]. Take (2.0 - Difference) / 2 to + // reflect the similarity between function profiles in [0.0f to 1.0f]. + return (2.0 - Difference) / 2; +} + +double SampleOverlapAggregator::weightForFuncSimilarity( + double FuncInternalSimilarity, uint64_t BaseFuncSample, + uint64_t TestFuncSample) const { + // Compute the weight as the distance between the function weights in two + // profiles. + double BaseFrac = 0.0; + double TestFrac = 0.0; + assert(ProfOverlap.BaseSample > 0 && + "Total samples in base profile should be greater than 0"); + BaseFrac = static_cast(BaseFuncSample) / ProfOverlap.BaseSample; + assert(ProfOverlap.TestSample > 0 && + "Total samples in test profile should be greater than 0"); + TestFrac = static_cast(TestFuncSample) / ProfOverlap.TestSample; + double WeightDistance = std::fabs(BaseFrac - TestFrac); + + // Take WeightDistance into the similarity. + return FuncInternalSimilarity * (1 - WeightDistance); +} + +double +SampleOverlapAggregator::weightByImportance(double FuncSimilarity, + uint64_t BaseFuncSample, + uint64_t TestFuncSample) const { + + double BaseFrac = 0.0; + double TestFrac = 0.0; + assert(ProfOverlap.BaseSample > 0 && + "Total samples in base profile should be greater than 0"); + BaseFrac = static_cast(BaseFuncSample) / ProfOverlap.BaseSample / 2.0; + assert(ProfOverlap.TestSample > 0 && + "Total samples in test profile should be greater than 0"); + TestFrac = static_cast(TestFuncSample) / ProfOverlap.TestSample / 2.0; + return FuncSimilarity * (BaseFrac + TestFrac); +} + +double SampleOverlapAggregator::computeSampleFunctionOverlap( + const sampleprof::FunctionSamples *BaseFunc, + const sampleprof::FunctionSamples *TestFunc, + SampleOverlapStats *FuncOverlap, uint64_t BaseFuncSample, + uint64_t TestFuncSample) { + // Default function internal similarity before weighted, meaning two functions + // has no overlap. 
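The two loops above pair up body samples and callsites with the sort-merge walk that MatchStep encapsulates: at each step the smaller key is unique to one side, and equal keys match. A standalone sketch of that walk (not part of the upstream file); mergeWalk is a hypothetical name and plain std::map stands in for the sample maps.

#include <cstdint>
#include <iostream>
#include <map>

static void mergeWalk(const std::map<uint64_t, uint64_t> &A,
                      const std::map<uint64_t, uint64_t> &B) {
  auto I = A.begin(), J = B.begin();
  while (I != A.end() || J != B.end()) {
    if (J == B.end() || (I != A.end() && I->first < J->first)) {
      std::cout << I->first << ": first-unique\n"; // cf. MS_FirstUnique
      ++I;
    } else if (I == A.end() || J->first < I->first) {
      std::cout << J->first << ": second-unique\n"; // cf. MS_SecondUnique
      ++J;
    } else {
      std::cout << I->first << ": match\n"; // cf. MS_Match
      ++I, ++J;
    }
  }
}

int main() {
  mergeWalk({{1, 10}, {3, 5}}, {{1, 8}, {4, 2}}); // 1 matches; 3, 4 unique
}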
+ const double DefaultFuncInternalSimilarity = 0; + double FuncSimilarity; + double FuncInternalSimilarity; + + // If BaseFunc or TestFunc is nullptr, it means the functions do not overlap. + // In this case, we use DefaultFuncInternalSimilarity as the function internal + // similarity. + if (!BaseFunc || !TestFunc) { + FuncInternalSimilarity = DefaultFuncInternalSimilarity; + } else { + assert(FuncOverlap != nullptr && + "FuncOverlap should be provided in this case"); + FuncInternalSimilarity = computeSampleFunctionInternalOverlap( + *BaseFunc, *TestFunc, *FuncOverlap); + // Now, FuncInternalSimilarity may be a little less than 0 due to + // imprecision of floating point accumulations. Make it zero if the + // difference is below Epsilon. + FuncInternalSimilarity = (std::fabs(FuncInternalSimilarity - 0) < Epsilon) + ? 0 + : FuncInternalSimilarity; + } + FuncSimilarity = weightForFuncSimilarity(FuncInternalSimilarity, + BaseFuncSample, TestFuncSample); + return FuncSimilarity; +} + +void SampleOverlapAggregator::computeSampleProfileOverlap(raw_fd_ostream &OS) { + using namespace sampleprof; + + std::unordered_map + BaseFuncProf; + const auto &BaseProfiles = BaseReader->getProfiles(); + for (const auto &BaseFunc : BaseProfiles) { + BaseFuncProf.emplace(BaseFunc.second.getContext(), &(BaseFunc.second)); + } + ProfOverlap.UnionCount = BaseFuncProf.size(); + + const auto &TestProfiles = TestReader->getProfiles(); + for (const auto &TestFunc : TestProfiles) { + SampleOverlapStats FuncOverlap; + FuncOverlap.TestName = TestFunc.second.getContext(); + assert(TestStats.count(FuncOverlap.TestName) && + "TestStats should have records for all functions in test profile " + "except inlinees"); + FuncOverlap.TestSample = TestStats[FuncOverlap.TestName].SampleSum; + + bool Matched = false; + const auto Match = BaseFuncProf.find(FuncOverlap.TestName); + if (Match == BaseFuncProf.end()) { + const FuncSampleStats &FuncStats = TestStats[FuncOverlap.TestName]; + ++ProfOverlap.TestUniqueCount; + ProfOverlap.TestUniqueSample += FuncStats.SampleSum; + FuncOverlap.TestUniqueSample = FuncStats.SampleSum; + + updateHotBlockOverlap(0, FuncStats.SampleSum, FuncStats.HotBlockCount); + + double FuncSimilarity = computeSampleFunctionOverlap( + nullptr, nullptr, nullptr, 0, FuncStats.SampleSum); + ProfOverlap.Similarity += + weightByImportance(FuncSimilarity, 0, FuncStats.SampleSum); + + ++ProfOverlap.UnionCount; + ProfOverlap.UnionSample += FuncStats.SampleSum; + } else { + ++ProfOverlap.OverlapCount; + + // Two functions match with each other. Compute function-level overlap and + // aggregate them into profile-level overlap. + FuncOverlap.BaseName = Match->second->getContext(); + assert(BaseStats.count(FuncOverlap.BaseName) && + "BaseStats should have records for all functions in base profile " + "except inlinees"); + FuncOverlap.BaseSample = BaseStats[FuncOverlap.BaseName].SampleSum; + + FuncOverlap.Similarity = computeSampleFunctionOverlap( + Match->second, &TestFunc.second, &FuncOverlap, FuncOverlap.BaseSample, + FuncOverlap.TestSample); + ProfOverlap.Similarity += + weightByImportance(FuncOverlap.Similarity, FuncOverlap.BaseSample, + FuncOverlap.TestSample); + ProfOverlap.OverlapSample += FuncOverlap.OverlapSample; + ProfOverlap.UnionSample += FuncOverlap.UnionSample; + + // Accumulate the percentage of base unique and test unique samples into + // ProfOverlap. 
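The calls to weightByImportance() above implement the aggregation documented earlier: PS = sum_A(FS(A) * avg(WB(A), WT(A))). A standalone numeric sketch of just that arithmetic (not part of the upstream file), with made-up similarities and weights:

#include <iostream>

int main() {
  // Two functions; weights are each function's share of total samples.
  double FS[] = {0.9, 0.5};   // function-level similarities
  double WB[] = {0.75, 0.25}; // weights in the base profile
  double WT[] = {0.50, 0.50}; // weights in the test profile
  double PS = 0.0;
  for (int I = 0; I < 2; ++I)
    PS += FS[I] * (WB[I] + WT[I]) / 2.0;
  std::cout << PS << "\n"; // 0.9 * 0.625 + 0.5 * 0.375 = 0.75
}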
+ ProfOverlap.BaseUniqueSample += FuncOverlap.BaseUniqueSample; + ProfOverlap.TestUniqueSample += FuncOverlap.TestUniqueSample; + + // Remove matched base functions for later reporting functions not found + // in test profile. + BaseFuncProf.erase(Match); + Matched = true; + } + + // Print function-level similarity information if specified by options. + assert(TestStats.count(FuncOverlap.TestName) && + "TestStats should have records for all functions in test profile " + "except inlinees"); + if (TestStats[FuncOverlap.TestName].MaxSample >= FuncFilter.ValueCutoff || + (Matched && FuncOverlap.Similarity < LowSimilarityThreshold) || + (Matched && !FuncFilter.NameFilter.empty() && + FuncOverlap.BaseName.toString().find(FuncFilter.NameFilter) != + std::string::npos)) { + assert(ProfOverlap.BaseSample > 0 && + "Total samples in base profile should be greater than 0"); + FuncOverlap.BaseWeight = + static_cast(FuncOverlap.BaseSample) / ProfOverlap.BaseSample; + assert(ProfOverlap.TestSample > 0 && + "Total samples in test profile should be greater than 0"); + FuncOverlap.TestWeight = + static_cast(FuncOverlap.TestSample) / ProfOverlap.TestSample; + FuncSimilarityDump.emplace(FuncOverlap.BaseWeight, FuncOverlap); + } + } + + // Traverse through functions in base profile but not in test profile. + for (const auto &F : BaseFuncProf) { + assert(BaseStats.count(F.second->getContext()) && + "BaseStats should have records for all functions in base profile " + "except inlinees"); + const FuncSampleStats &FuncStats = BaseStats[F.second->getContext()]; + ++ProfOverlap.BaseUniqueCount; + ProfOverlap.BaseUniqueSample += FuncStats.SampleSum; + + updateHotBlockOverlap(FuncStats.SampleSum, 0, FuncStats.HotBlockCount); + + double FuncSimilarity = computeSampleFunctionOverlap( + nullptr, nullptr, nullptr, FuncStats.SampleSum, 0); + ProfOverlap.Similarity += + weightByImportance(FuncSimilarity, FuncStats.SampleSum, 0); + + ProfOverlap.UnionSample += FuncStats.SampleSum; + } + + // Now, ProfSimilarity may be a little greater than 1 due to imprecision + // of floating point accumulations. Make it 1.0 if the difference is below + // Epsilon. + ProfOverlap.Similarity = (std::fabs(ProfOverlap.Similarity - 1) < Epsilon) + ? 
1 + : ProfOverlap.Similarity; + + computeHotFuncOverlap(); +} + +void SampleOverlapAggregator::initializeSampleProfileOverlap() { + const auto &BaseProf = BaseReader->getProfiles(); + for (const auto &I : BaseProf) { + ++ProfOverlap.BaseCount; + FuncSampleStats FuncStats; + getFuncSampleStats(I.second, FuncStats, BaseHotThreshold); + ProfOverlap.BaseSample += FuncStats.SampleSum; + BaseStats.emplace(I.second.getContext(), FuncStats); + } + + const auto &TestProf = TestReader->getProfiles(); + for (const auto &I : TestProf) { + ++ProfOverlap.TestCount; + FuncSampleStats FuncStats; + getFuncSampleStats(I.second, FuncStats, TestHotThreshold); + ProfOverlap.TestSample += FuncStats.SampleSum; + TestStats.emplace(I.second.getContext(), FuncStats); + } + + ProfOverlap.BaseName = StringRef(BaseFilename); + ProfOverlap.TestName = StringRef(TestFilename); +} + +void SampleOverlapAggregator::dumpFuncSimilarity(raw_fd_ostream &OS) const { + using namespace sampleprof; + + if (FuncSimilarityDump.empty()) + return; + + formatted_raw_ostream FOS(OS); + FOS << "Function-level details:\n"; + FOS << "Base weight"; + FOS.PadToColumn(TestWeightCol); + FOS << "Test weight"; + FOS.PadToColumn(SimilarityCol); + FOS << "Similarity"; + FOS.PadToColumn(OverlapCol); + FOS << "Overlap"; + FOS.PadToColumn(BaseUniqueCol); + FOS << "Base unique"; + FOS.PadToColumn(TestUniqueCol); + FOS << "Test unique"; + FOS.PadToColumn(BaseSampleCol); + FOS << "Base samples"; + FOS.PadToColumn(TestSampleCol); + FOS << "Test samples"; + FOS.PadToColumn(FuncNameCol); + FOS << "Function name\n"; + for (const auto &F : FuncSimilarityDump) { + double OverlapPercent = + F.second.UnionSample > 0 + ? static_cast(F.second.OverlapSample) / F.second.UnionSample + : 0; + double BaseUniquePercent = + F.second.BaseSample > 0 + ? static_cast(F.second.BaseUniqueSample) / + F.second.BaseSample + : 0; + double TestUniquePercent = + F.second.TestSample > 0 + ? 
+                  static_cast<double>(F.second.TestUniqueSample) /
+                  F.second.TestSample
+            : 0;
+
+    FOS << format("%.2f%%", F.second.BaseWeight * 100);
+    FOS.PadToColumn(TestWeightCol);
+    FOS << format("%.2f%%", F.second.TestWeight * 100);
+    FOS.PadToColumn(SimilarityCol);
+    FOS << format("%.2f%%", F.second.Similarity * 100);
+    FOS.PadToColumn(OverlapCol);
+    FOS << format("%.2f%%", OverlapPercent * 100);
+    FOS.PadToColumn(BaseUniqueCol);
+    FOS << format("%.2f%%", BaseUniquePercent * 100);
+    FOS.PadToColumn(TestUniqueCol);
+    FOS << format("%.2f%%", TestUniquePercent * 100);
+    FOS.PadToColumn(BaseSampleCol);
+    FOS << F.second.BaseSample;
+    FOS.PadToColumn(TestSampleCol);
+    FOS << F.second.TestSample;
+    FOS.PadToColumn(FuncNameCol);
+    FOS << F.second.TestName.toString() << "\n";
+  }
+}
+
+void SampleOverlapAggregator::dumpProgramSummary(raw_fd_ostream &OS) const {
+  OS << "Profile overlap information for base_profile: "
+     << ProfOverlap.BaseName.toString()
+     << " and test_profile: " << ProfOverlap.TestName.toString()
+     << "\nProgram level:\n";
+
+  OS << "  Whole program profile similarity: "
+     << format("%.3f%%", ProfOverlap.Similarity * 100) << "\n";
+
+  assert(ProfOverlap.UnionSample > 0 &&
+         "Total samples in the two profiles should be greater than 0");
+  double OverlapPercent =
+      static_cast<double>(ProfOverlap.OverlapSample) / ProfOverlap.UnionSample;
+  assert(ProfOverlap.BaseSample > 0 &&
+         "Total samples in base profile should be greater than 0");
+  double BaseUniquePercent = static_cast<double>(ProfOverlap.BaseUniqueSample) /
+                             ProfOverlap.BaseSample;
+  assert(ProfOverlap.TestSample > 0 &&
+         "Total samples in test profile should be greater than 0");
+  double TestUniquePercent = static_cast<double>(ProfOverlap.TestUniqueSample) /
+                             ProfOverlap.TestSample;
+
+  OS << "  Whole program sample overlap: "
+     << format("%.3f%%", OverlapPercent * 100) << "\n";
+  OS << "    percentage of samples unique in base profile: "
+     << format("%.3f%%", BaseUniquePercent * 100) << "\n";
+  OS << "    percentage of samples unique in test profile: "
+     << format("%.3f%%", TestUniquePercent * 100) << "\n";
+  OS << "    total samples in base profile: " << ProfOverlap.BaseSample << "\n"
+     << "    total samples in test profile: " << ProfOverlap.TestSample << "\n";
+
+  assert(ProfOverlap.UnionCount > 0 &&
+         "There should be at least one function in the two input profiles");
+  double FuncOverlapPercent =
+      static_cast<double>(ProfOverlap.OverlapCount) / ProfOverlap.UnionCount;
+  OS << "  Function overlap: " << format("%.3f%%", FuncOverlapPercent * 100)
+     << "\n";
+  OS << "    overlap functions: " << ProfOverlap.OverlapCount << "\n";
+  OS << "    functions unique in base profile: " << ProfOverlap.BaseUniqueCount
+     << "\n";
+  OS << "    functions unique in test profile: " << ProfOverlap.TestUniqueCount
+     << "\n";
+}
+
+void SampleOverlapAggregator::dumpHotFuncAndBlockOverlap(
+    raw_fd_ostream &OS) const {
+  assert(HotFuncOverlap.UnionCount > 0 &&
+         "There should be at least one hot function in the two input profiles");
+  OS << "  Hot-function overlap: "
+     << format("%.3f%%", static_cast<double>(HotFuncOverlap.OverlapCount) /
+                             HotFuncOverlap.UnionCount * 100)
+     << "\n";
+  OS << "    overlap hot functions: " << HotFuncOverlap.OverlapCount << "\n";
+  OS << "    hot functions unique in base profile: "
+     << HotFuncOverlap.BaseCount - HotFuncOverlap.OverlapCount << "\n";
+  OS << "    hot functions unique in test profile: "
+     << HotFuncOverlap.TestCount - HotFuncOverlap.OverlapCount << "\n";
+
+  assert(HotBlockOverlap.UnionCount > 0 &&
+         "There should be at least one hot block in the two input profiles");
+  OS << "  Hot-block overlap: "
+     << format("%.3f%%", static_cast<double>(HotBlockOverlap.OverlapCount) /
+                             HotBlockOverlap.UnionCount * 100)
+     << "\n";
+  OS << "    overlap hot blocks: " << HotBlockOverlap.OverlapCount << "\n";
+  OS << "    hot blocks unique in base profile: "
+     << HotBlockOverlap.BaseCount - HotBlockOverlap.OverlapCount << "\n";
+  OS << "    hot blocks unique in test profile: "
+     << HotBlockOverlap.TestCount - HotBlockOverlap.OverlapCount << "\n";
+}
+
+std::error_code SampleOverlapAggregator::loadProfiles() {
+  using namespace sampleprof;
+
+  LLVMContext Context;
+  auto FS = vfs::getRealFileSystem();
+  auto BaseReaderOrErr = SampleProfileReader::create(BaseFilename, Context, *FS,
+                                                     FSDiscriminatorPassOption);
+  if (std::error_code EC = BaseReaderOrErr.getError())
+    exitWithErrorCode(EC, BaseFilename);
+
+  auto TestReaderOrErr = SampleProfileReader::create(TestFilename, Context, *FS,
+                                                     FSDiscriminatorPassOption);
+  if (std::error_code EC = TestReaderOrErr.getError())
+    exitWithErrorCode(EC, TestFilename);
+
+  BaseReader = std::move(BaseReaderOrErr.get());
+  TestReader = std::move(TestReaderOrErr.get());
+
+  if (std::error_code EC = BaseReader->read())
+    exitWithErrorCode(EC, BaseFilename);
+  if (std::error_code EC = TestReader->read())
+    exitWithErrorCode(EC, TestFilename);
+  if (BaseReader->profileIsProbeBased() != TestReader->profileIsProbeBased())
+    exitWithError(
+        "cannot compare probe-based profile with non-probe-based profile");
+  if (BaseReader->profileIsCS() != TestReader->profileIsCS())
+    exitWithError("cannot compare CS profile with non-CS profile");
+
+  // Load BaseHotThreshold and TestHotThreshold as the 99-percentile threshold
+  // in the profile summary.
+  ProfileSummary &BasePS = BaseReader->getSummary();
+  ProfileSummary &TestPS = TestReader->getSummary();
+  BaseHotThreshold =
+      ProfileSummaryBuilder::getHotCountThreshold(BasePS.getDetailedSummary());
+  TestHotThreshold =
+      ProfileSummaryBuilder::getHotCountThreshold(TestPS.getDetailedSummary());
+
+  return std::error_code();
+}
+
+void overlapSampleProfile(const std::string &BaseFilename,
+                          const std::string &TestFilename,
+                          const OverlapFuncFilters &FuncFilter,
+                          uint64_t SimilarityCutoff, raw_fd_ostream &OS) {
+  using namespace sampleprof;
+
+  // We use 0.000005 to initialize OverlapAggr.Epsilon because the final
+  // metrics report 2-3 places after the decimal point in percentage numbers.
+ SampleOverlapAggregator OverlapAggr( + BaseFilename, TestFilename, + static_cast(SimilarityCutoff) / 1000000, 0.000005, FuncFilter); + if (std::error_code EC = OverlapAggr.loadProfiles()) + exitWithErrorCode(EC); + + OverlapAggr.initializeSampleProfileOverlap(); + if (OverlapAggr.detectZeroSampleProfile(OS)) + return; + + OverlapAggr.computeSampleProfileOverlap(OS); + + OverlapAggr.dumpProgramSummary(OS); + OverlapAggr.dumpHotFuncAndBlockOverlap(OS); + OverlapAggr.dumpFuncSimilarity(OS); +} + +static int overlap_main() { + std::error_code EC; + raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF); + if (EC) + exitWithErrorCode(EC, OutputFilename); + + if (ProfileKind == instr) + overlapInstrProfile(BaseFilename, TestFilename, + OverlapFuncFilters{OverlapValueCutoff, FuncNameFilter}, + OS, IsCS); + else + overlapSampleProfile(BaseFilename, TestFilename, + OverlapFuncFilters{OverlapValueCutoff, FuncNameFilter}, + SimilarityCutoff, OS); + + return 0; +} + +namespace { +struct ValueSitesStats { + ValueSitesStats() = default; + uint64_t TotalNumValueSites = 0; + uint64_t TotalNumValueSitesWithValueProfile = 0; + uint64_t TotalNumValues = 0; + std::vector ValueSitesHistogram; +}; +} // namespace + +static void traverseAllValueSites(const InstrProfRecord &Func, uint32_t VK, + ValueSitesStats &Stats, raw_fd_ostream &OS, + InstrProfSymtab *Symtab) { + uint32_t NS = Func.getNumValueSites(VK); + Stats.TotalNumValueSites += NS; + for (size_t I = 0; I < NS; ++I) { + auto VD = Func.getValueArrayForSite(VK, I); + uint32_t NV = VD.size(); + if (NV == 0) + continue; + Stats.TotalNumValues += NV; + Stats.TotalNumValueSitesWithValueProfile++; + if (NV > Stats.ValueSitesHistogram.size()) + Stats.ValueSitesHistogram.resize(NV, 0); + Stats.ValueSitesHistogram[NV - 1]++; + + uint64_t SiteSum = 0; + for (const auto &V : VD) + SiteSum += V.Count; + if (SiteSum == 0) + SiteSum = 1; + + for (const auto &V : VD) { + OS << "\t[ " << format("%2u", I) << ", "; + if (Symtab == nullptr) + OS << format("%4" PRIu64, V.Value); + else + OS << Symtab->getFuncOrVarName(V.Value); + OS << ", " << format("%10" PRId64, V.Count) << " ] (" + << format("%.2f%%", (V.Count * 100.0 / SiteSum)) << ")\n"; + } + } +} + +static void showValueSitesStats(raw_fd_ostream &OS, uint32_t VK, + ValueSitesStats &Stats) { + OS << " Total number of sites: " << Stats.TotalNumValueSites << "\n"; + OS << " Total number of sites with values: " + << Stats.TotalNumValueSitesWithValueProfile << "\n"; + OS << " Total number of profiled values: " << Stats.TotalNumValues << "\n"; + + OS << " Value sites histogram:\n\tNumTargets, SiteCount\n"; + for (unsigned I = 0; I < Stats.ValueSitesHistogram.size(); I++) { + if (Stats.ValueSitesHistogram[I] > 0) + OS << "\t" << I + 1 << ", " << Stats.ValueSitesHistogram[I] << "\n"; + } +} + +static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { + if (SFormat == ShowFormat::Json) + exitWithError("JSON output is not supported for instr profiles"); + if (SFormat == ShowFormat::Yaml) + exitWithError("YAML output is not supported for instr profiles"); + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = InstrProfReader::create(Filename, *FS); + std::vector Cutoffs = std::move(DetailedSummaryCutoffs); + if (ShowDetailedSummary && Cutoffs.empty()) { + Cutoffs = ProfileSummaryBuilder::DefaultCutoffs; + } + InstrProfSummaryBuilder Builder(std::move(Cutoffs)); + if (Error E = ReaderOrErr.takeError()) + exitWithError(std::move(E), Filename); + + auto Reader = std::move(ReaderOrErr.get()); + bool 
IsIRInstr = Reader->isIRLevelProfile(); + size_t ShownFunctions = 0; + size_t BelowCutoffFunctions = 0; + int NumVPKind = IPVK_Last - IPVK_First + 1; + std::vector VPStats(NumVPKind); + + auto MinCmp = [](const std::pair &v1, + const std::pair &v2) { + return v1.second > v2.second; + }; + + std::priority_queue, + std::vector>, + decltype(MinCmp)> + HottestFuncs(MinCmp); + + if (!TextFormat && OnlyListBelow) { + OS << "The list of functions with the maximum counter less than " + << ShowValueCutoff << ":\n"; + } + + // Add marker so that IR-level instrumentation round-trips properly. + if (TextFormat && IsIRInstr) + OS << ":ir\n"; + + for (const auto &Func : *Reader) { + if (Reader->isIRLevelProfile()) { + bool FuncIsCS = NamedInstrProfRecord::hasCSFlagInHash(Func.Hash); + if (FuncIsCS != ShowCS) + continue; + } + bool Show = ShowAllFunctions || + (!FuncNameFilter.empty() && Func.Name.contains(FuncNameFilter)); + + bool doTextFormatDump = (Show && TextFormat); + + if (doTextFormatDump) { + InstrProfSymtab &Symtab = Reader->getSymtab(); + InstrProfWriter::writeRecordInText(Func.Name, Func.Hash, Func, Symtab, + OS); + continue; + } + + assert(Func.Counts.size() > 0 && "function missing entry counter"); + Builder.addRecord(Func); + + if (ShowCovered) { + if (llvm::any_of(Func.Counts, [](uint64_t C) { return C; })) + OS << Func.Name << "\n"; + continue; + } + + uint64_t FuncMax = 0; + uint64_t FuncSum = 0; + + auto PseudoKind = Func.getCountPseudoKind(); + if (PseudoKind != InstrProfRecord::NotPseudo) { + if (Show) { + if (!ShownFunctions) + OS << "Counters:\n"; + ++ShownFunctions; + OS << " " << Func.Name << ":\n" + << " Hash: " << format("0x%016" PRIx64, Func.Hash) << "\n" + << " Counters: " << Func.Counts.size(); + if (PseudoKind == InstrProfRecord::PseudoHot) + OS << " \n"; + else if (PseudoKind == InstrProfRecord::PseudoWarm) + OS << " \n"; + else + llvm_unreachable("Unknown PseudoKind"); + } + continue; + } + + for (size_t I = 0, E = Func.Counts.size(); I < E; ++I) { + FuncMax = std::max(FuncMax, Func.Counts[I]); + FuncSum += Func.Counts[I]; + } + + if (FuncMax < ShowValueCutoff) { + ++BelowCutoffFunctions; + if (OnlyListBelow) { + OS << " " << Func.Name << ": (Max = " << FuncMax + << " Sum = " << FuncSum << ")\n"; + } + continue; + } else if (OnlyListBelow) + continue; + + if (TopNFunctions) { + if (HottestFuncs.size() == TopNFunctions) { + if (HottestFuncs.top().second < FuncMax) { + HottestFuncs.pop(); + HottestFuncs.emplace(std::make_pair(std::string(Func.Name), FuncMax)); + } + } else + HottestFuncs.emplace(std::make_pair(std::string(Func.Name), FuncMax)); + } + + if (Show) { + if (!ShownFunctions) + OS << "Counters:\n"; + + ++ShownFunctions; + + OS << " " << Func.Name << ":\n" + << " Hash: " << format("0x%016" PRIx64, Func.Hash) << "\n" + << " Counters: " << Func.Counts.size() << "\n"; + if (!IsIRInstr) + OS << " Function count: " << Func.Counts[0] << "\n"; + + if (ShowIndirectCallTargets) + OS << " Indirect Call Site Count: " + << Func.getNumValueSites(IPVK_IndirectCallTarget) << "\n"; + + if (ShowVTables) + OS << " Number of instrumented vtables: " + << Func.getNumValueSites(IPVK_VTableTarget) << "\n"; + + uint32_t NumMemOPCalls = Func.getNumValueSites(IPVK_MemOPSize); + if (ShowMemOPSizes && NumMemOPCalls > 0) + OS << " Number of Memory Intrinsics Calls: " << NumMemOPCalls + << "\n"; + + if (ShowCounts) { + OS << " Block counts: ["; + size_t Start = (IsIRInstr ? 0 : 1); + for (size_t I = Start, E = Func.Counts.size(); I < E; ++I) { + OS << (I == Start ? 
"" : ", ") << Func.Counts[I]; + } + OS << "]\n"; + } + + if (ShowIndirectCallTargets) { + OS << " Indirect Target Results:\n"; + traverseAllValueSites(Func, IPVK_IndirectCallTarget, + VPStats[IPVK_IndirectCallTarget], OS, + &(Reader->getSymtab())); + } + + if (ShowVTables) { + OS << " VTable Results:\n"; + traverseAllValueSites(Func, IPVK_VTableTarget, + VPStats[IPVK_VTableTarget], OS, + &(Reader->getSymtab())); + } + + if (ShowMemOPSizes && NumMemOPCalls > 0) { + OS << " Memory Intrinsic Size Results:\n"; + traverseAllValueSites(Func, IPVK_MemOPSize, VPStats[IPVK_MemOPSize], OS, + nullptr); + } + } + } + if (Reader->hasError()) + exitWithError(Reader->getError(), Filename); + + if (TextFormat || ShowCovered) + return 0; + std::unique_ptr PS(Builder.getSummary()); + bool IsIR = Reader->isIRLevelProfile(); + OS << "Instrumentation level: " << (IsIR ? "IR" : "Front-end"); + if (IsIR) + OS << " entry_first = " << Reader->instrEntryBBEnabled(); + OS << "\n"; + if (ShowAllFunctions || !FuncNameFilter.empty()) + OS << "Functions shown: " << ShownFunctions << "\n"; + OS << "Total functions: " << PS->getNumFunctions() << "\n"; + if (ShowValueCutoff > 0) { + OS << "Number of functions with maximum count (< " << ShowValueCutoff + << "): " << BelowCutoffFunctions << "\n"; + OS << "Number of functions with maximum count (>= " << ShowValueCutoff + << "): " << PS->getNumFunctions() - BelowCutoffFunctions << "\n"; + } + OS << "Maximum function count: " << PS->getMaxFunctionCount() << "\n"; + OS << "Maximum internal block count: " << PS->getMaxInternalCount() << "\n"; + + if (TopNFunctions) { + std::vector> SortedHottestFuncs; + while (!HottestFuncs.empty()) { + SortedHottestFuncs.emplace_back(HottestFuncs.top()); + HottestFuncs.pop(); + } + OS << "Top " << TopNFunctions + << " functions with the largest internal block counts: \n"; + for (auto &hotfunc : llvm::reverse(SortedHottestFuncs)) + OS << " " << hotfunc.first << ", max count = " << hotfunc.second << "\n"; + } + + if (ShownFunctions && ShowIndirectCallTargets) { + OS << "Statistics for indirect call sites profile:\n"; + showValueSitesStats(OS, IPVK_IndirectCallTarget, + VPStats[IPVK_IndirectCallTarget]); + } + + if (ShownFunctions && ShowVTables) { + OS << "Statistics for vtable profile:\n"; + showValueSitesStats(OS, IPVK_VTableTarget, VPStats[IPVK_VTableTarget]); + } + + if (ShownFunctions && ShowMemOPSizes) { + OS << "Statistics for memory intrinsic calls sizes profile:\n"; + showValueSitesStats(OS, IPVK_MemOPSize, VPStats[IPVK_MemOPSize]); + } + + if (ShowDetailedSummary) { + OS << "Total number of blocks: " << PS->getNumCounts() << "\n"; + OS << "Total count: " << PS->getTotalCount() << "\n"; + PS->printDetailedSummary(OS); + } + + if (ShowBinaryIds) + if (Error E = Reader->printBinaryIds(OS)) + exitWithError(std::move(E), Filename); + + if (ShowProfileVersion) + OS << "Profile version: " << Reader->getVersion() << "\n"; + + if (ShowTemporalProfTraces) { + auto &Traces = Reader->getTemporalProfTraces(); + OS << "Temporal Profile Traces (samples=" << Traces.size() + << " seen=" << Reader->getTemporalProfTraceStreamSize() << "):\n"; + for (unsigned i = 0; i < Traces.size(); i++) { + OS << " Temporal Profile Trace " << i << " (weight=" << Traces[i].Weight + << " count=" << Traces[i].FunctionNameRefs.size() << "):\n"; + for (auto &NameRef : Traces[i].FunctionNameRefs) + OS << " " << Reader->getSymtab().getFuncOrVarName(NameRef) << "\n"; + } + } + + return 0; +} + +static void showSectionInfo(sampleprof::SampleProfileReader *Reader, + raw_fd_ostream 
&OS) { + if (!Reader->dumpSectionInfo(OS)) { + WithColor::warning() << "-show-sec-info-only is only supported for " + << "sample profile in extbinary format and is " + << "ignored for other formats.\n"; + return; + } +} + +namespace { +struct HotFuncInfo { + std::string FuncName; + uint64_t TotalCount = 0; + double TotalCountPercent = 0.0f; + uint64_t MaxCount = 0; + uint64_t EntryCount = 0; + + HotFuncInfo() = default; + + HotFuncInfo(StringRef FN, uint64_t TS, double TSP, uint64_t MS, uint64_t ES) + : FuncName(FN.begin(), FN.end()), TotalCount(TS), TotalCountPercent(TSP), + MaxCount(MS), EntryCount(ES) {} +}; +} // namespace + +// Print out detailed information about hot functions in PrintValues vector. +// Users specify titles and offset of every columns through ColumnTitle and +// ColumnOffset. The size of ColumnTitle and ColumnOffset need to be the same +// and at least 4. Besides, users can optionally give a HotFuncMetric string to +// print out or let it be an empty string. +static void dumpHotFunctionList(const std::vector &ColumnTitle, + const std::vector &ColumnOffset, + const std::vector &PrintValues, + uint64_t HotFuncCount, uint64_t TotalFuncCount, + uint64_t HotProfCount, uint64_t TotalProfCount, + const std::string &HotFuncMetric, + uint32_t TopNFunctions, raw_fd_ostream &OS) { + assert(ColumnOffset.size() == ColumnTitle.size() && + "ColumnOffset and ColumnTitle should have the same size"); + assert(ColumnTitle.size() >= 4 && + "ColumnTitle should have at least 4 elements"); + assert(TotalFuncCount > 0 && + "There should be at least one function in the profile"); + double TotalProfPercent = 0; + if (TotalProfCount > 0) + TotalProfPercent = static_cast(HotProfCount) / TotalProfCount * 100; + + formatted_raw_ostream FOS(OS); + FOS << HotFuncCount << " out of " << TotalFuncCount + << " functions with profile (" + << format("%.2f%%", + (static_cast(HotFuncCount) / TotalFuncCount * 100)) + << ") are considered hot functions"; + if (!HotFuncMetric.empty()) + FOS << " (" << HotFuncMetric << ")"; + FOS << ".\n"; + FOS << HotProfCount << " out of " << TotalProfCount << " profile counts (" + << format("%.2f%%", TotalProfPercent) << ") are from hot functions.\n"; + + for (size_t I = 0; I < ColumnTitle.size(); ++I) { + FOS.PadToColumn(ColumnOffset[I]); + FOS << ColumnTitle[I]; + } + FOS << "\n"; + + uint32_t Count = 0; + for (const auto &R : PrintValues) { + if (TopNFunctions && (Count++ == TopNFunctions)) + break; + FOS.PadToColumn(ColumnOffset[0]); + FOS << R.TotalCount << " (" << format("%.2f%%", R.TotalCountPercent) << ")"; + FOS.PadToColumn(ColumnOffset[1]); + FOS << R.MaxCount; + FOS.PadToColumn(ColumnOffset[2]); + FOS << R.EntryCount; + FOS.PadToColumn(ColumnOffset[3]); + FOS << R.FuncName << "\n"; + } +} + +static int showHotFunctionList(const sampleprof::SampleProfileMap &Profiles, + ProfileSummary &PS, uint32_t TopN, + raw_fd_ostream &OS) { + using namespace sampleprof; + + const uint32_t HotFuncCutoff = 990000; + auto &SummaryVector = PS.getDetailedSummary(); + uint64_t MinCountThreshold = 0; + for (const ProfileSummaryEntry &SummaryEntry : SummaryVector) { + if (SummaryEntry.Cutoff == HotFuncCutoff) { + MinCountThreshold = SummaryEntry.MinCount; + break; + } + } + + // Traverse all functions in the profile and keep only hot functions. + // The following loop also calculates the sum of total samples of all + // functions. 
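The hot-function list built next stays sorted because HotFunc is a std::multimap keyed on total samples with std::greater: iteration yields hottest-first, and functions with equal totals are all retained. A standalone sketch of that ordering (not part of the upstream file), with strings standing in for the FunctionSamples pointers:

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <string>

int main() {
  std::multimap<uint64_t, std::string, std::greater<uint64_t>> HotFunc;
  HotFunc.emplace(500, "foo");
  HotFunc.emplace(1200, "main");
  HotFunc.emplace(500, "bar"); // duplicate keys are why multimap is used
  for (const auto &F : HotFunc)
    std::cout << F.second << ": " << F.first << "\n"; // main, foo, bar
}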
+ std::multimap, + std::greater> + HotFunc; + uint64_t ProfileTotalSample = 0; + uint64_t HotFuncSample = 0; + uint64_t HotFuncCount = 0; + + for (const auto &I : Profiles) { + FuncSampleStats FuncStats; + const FunctionSamples &FuncProf = I.second; + ProfileTotalSample += FuncProf.getTotalSamples(); + getFuncSampleStats(FuncProf, FuncStats, MinCountThreshold); + + if (isFunctionHot(FuncStats, MinCountThreshold)) { + HotFunc.emplace(FuncProf.getTotalSamples(), + std::make_pair(&(I.second), FuncStats.MaxSample)); + HotFuncSample += FuncProf.getTotalSamples(); + ++HotFuncCount; + } + } + + std::vector ColumnTitle{"Total sample (%)", "Max sample", + "Entry sample", "Function name"}; + std::vector ColumnOffset{0, 24, 42, 58}; + std::string Metric = + std::string("max sample >= ") + std::to_string(MinCountThreshold); + std::vector PrintValues; + for (const auto &FuncPair : HotFunc) { + const FunctionSamples &Func = *FuncPair.second.first; + double TotalSamplePercent = + (ProfileTotalSample > 0) + ? (Func.getTotalSamples() * 100.0) / ProfileTotalSample + : 0; + PrintValues.emplace_back( + HotFuncInfo(Func.getContext().toString(), Func.getTotalSamples(), + TotalSamplePercent, FuncPair.second.second, + Func.getHeadSamplesEstimate())); + } + dumpHotFunctionList(ColumnTitle, ColumnOffset, PrintValues, HotFuncCount, + Profiles.size(), HotFuncSample, ProfileTotalSample, + Metric, TopN, OS); + + return 0; +} + +static int showSampleProfile(ShowFormat SFormat, raw_fd_ostream &OS) { + if (SFormat == ShowFormat::Yaml) + exitWithError("YAML output is not supported for sample profiles"); + using namespace sampleprof; + LLVMContext Context; + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = SampleProfileReader::create(Filename, Context, *FS, + FSDiscriminatorPassOption); + if (std::error_code EC = ReaderOrErr.getError()) + exitWithErrorCode(EC, Filename); + + auto Reader = std::move(ReaderOrErr.get()); + if (ShowSectionInfoOnly) { + showSectionInfo(Reader.get(), OS); + return 0; + } + + if (std::error_code EC = Reader->read()) + exitWithErrorCode(EC, Filename); + + if (ShowAllFunctions || FuncNameFilter.empty()) { + if (SFormat == ShowFormat::Json) + Reader->dumpJson(OS); + else + Reader->dump(OS); + } else { + if (SFormat == ShowFormat::Json) + exitWithError( + "the JSON format is supported only when all functions are to " + "be printed"); + + // TODO: parse context string to support filtering by contexts. + FunctionSamples *FS = Reader->getSamplesFor(StringRef(FuncNameFilter)); + Reader->dumpFunctionProfile(FS ? *FS : FunctionSamples(), OS); + } + + if (ShowProfileSymbolList) { + std::unique_ptr ReaderList = + Reader->getProfileSymbolList(); + ReaderList->dump(OS); + } + + if (ShowDetailedSummary) { + auto &PS = Reader->getSummary(); + PS.printSummary(OS); + PS.printDetailedSummary(OS); + } + + if (ShowHotFuncList || TopNFunctions) + showHotFunctionList(Reader->getProfiles(), Reader->getSummary(), + TopNFunctions, OS); + + return 0; +} + +static int showMemProfProfile(ShowFormat SFormat, raw_fd_ostream &OS) { + if (SFormat == ShowFormat::Json) + exitWithError("JSON output is not supported for MemProf"); + auto ReaderOr = llvm::memprof::RawMemProfReader::create( + Filename, ProfiledBinary, /*KeepNames=*/true); + if (Error E = ReaderOr.takeError()) + // Since the error can be related to the profile or the binary we do not + // pass whence. Instead additional context is provided where necessary in + // the error message. 
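dumpHotFunctionList() above lays out its table by jumping to fixed column offsets with formatted_raw_ostream::PadToColumn(). A standalone approximation (not part of the upstream file) using iostream field widths, where the offsets 0/24/42/58 become widths 24/18/16:

#include <iomanip>
#include <iostream>

int main() {
  std::cout << std::left << std::setw(24) << "Total sample (%)"
            << std::setw(18) << "Max sample" << std::setw(16) << "Entry sample"
            << "Function name\n";
  std::cout << std::left << std::setw(24) << "1234 (56.78%)" << std::setw(18)
            << 900 << std::setw(16) << 321 << "main\n";
}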
+ exitWithError(std::move(E), /*Whence*/ ""); + + std::unique_ptr Reader( + ReaderOr.get().release()); + + Reader->printYAML(OS); + return 0; +} + +static int showDebugInfoCorrelation(const std::string &Filename, + ShowFormat SFormat, raw_fd_ostream &OS) { + if (SFormat == ShowFormat::Json) + exitWithError("JSON output is not supported for debug info correlation"); + std::unique_ptr Correlator; + if (auto Err = + InstrProfCorrelator::get(Filename, InstrProfCorrelator::DEBUG_INFO) + .moveInto(Correlator)) + exitWithError(std::move(Err), Filename); + if (SFormat == ShowFormat::Yaml) { + if (auto Err = Correlator->dumpYaml(MaxDbgCorrelationWarnings, OS)) + exitWithError(std::move(Err), Filename); + return 0; + } + + if (auto Err = Correlator->correlateProfileData(MaxDbgCorrelationWarnings)) + exitWithError(std::move(Err), Filename); + + InstrProfSymtab Symtab; + if (auto Err = Symtab.create( + StringRef(Correlator->getNamesPointer(), Correlator->getNamesSize()))) + exitWithError(std::move(Err), Filename); + + if (ShowProfileSymbolList) + Symtab.dumpNames(OS); + // TODO: Read "Profile Data Type" from debug info to compute and show how many + // counters the section holds. + if (ShowDetailedSummary) + OS << "Counters section size: 0x" + << Twine::utohexstr(Correlator->getCountersSectionSize()) << " bytes\n"; + OS << "Found " << Correlator->getDataSize() << " functions\n"; + + return 0; +} + +static int show_main(StringRef ProgName) { + if (Filename.empty() && DebugInfoFilename.empty()) + exitWithError( + "the positional argument '' is required unless '--" + + DebugInfoFilename.ArgStr + "' is provided"); + + if (Filename == OutputFilename) { + errs() << ProgName + << " show: Input file name cannot be the same as the output file " + "name!\n"; + return 1; + } + if (JsonFormat) + SFormat = ShowFormat::Json; + + std::error_code EC; + raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF); + if (EC) + exitWithErrorCode(EC, OutputFilename); + + if (ShowAllFunctions && !FuncNameFilter.empty()) + WithColor::warning() << "-function argument ignored: showing all functions\n"; + + if (!DebugInfoFilename.empty()) + return showDebugInfoCorrelation(DebugInfoFilename, SFormat, OS); + + if (ShowProfileKind == instr) + return showInstrProfile(SFormat, OS); + if (ShowProfileKind == sample) + return showSampleProfile(SFormat, OS); + return showMemProfProfile(SFormat, OS); +} + +static int order_main() { + std::error_code EC; + raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF); + if (EC) + exitWithErrorCode(EC, OutputFilename); + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = InstrProfReader::create(Filename, *FS); + if (Error E = ReaderOrErr.takeError()) + exitWithError(std::move(E), Filename); + + auto Reader = std::move(ReaderOrErr.get()); + for (auto &I : *Reader) { + // Read all entries + (void)I; + } + ArrayRef Traces = Reader->getTemporalProfTraces(); + if (NumTestTraces && NumTestTraces >= Traces.size()) + exitWithError( + "--" + NumTestTraces.ArgStr + + " must be smaller than the total number of traces: expected: < " + + Twine(Traces.size()) + ", actual: " + Twine(NumTestTraces)); + ArrayRef TestTraces = Traces.take_back(NumTestTraces); + Traces = Traces.drop_back(NumTestTraces); + + std::vector Nodes; + TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes); + BalancedPartitioningConfig Config; + BalancedPartitioning BP(Config); + BP.run(Nodes); + + OS << "# Ordered " << Nodes.size() << " functions\n"; + if (!TestTraces.empty()) { + // Since we don't know 
the symbol sizes, we assume 32 functions per page.
+    DenseMap<uint64_t, unsigned> IdToPageNumber;
+    for (auto &Node : Nodes)
+      IdToPageNumber[Node.Id] = IdToPageNumber.size() / 32;
+
+    SmallSet<unsigned, 0> TouchedPages;
+    unsigned Area = 0;
+    for (auto &Trace : TestTraces) {
+      for (auto Id : Trace.FunctionNameRefs) {
+        auto It = IdToPageNumber.find(Id);
+        if (It == IdToPageNumber.end())
+          continue;
+        TouchedPages.insert(It->getSecond());
+        Area += TouchedPages.size();
+      }
+      TouchedPages.clear();
+    }
+    OS << "# Total area under the page fault curve: " << (float)Area << "\n";
+  }
+  OS << "# Warning: Mach-O may prefix symbols with \"_\" depending on the "
+        "linkage and this output does not take that into account. Some "
+        "post-processing may be required before passing to the linker via "
+        "-order_file.\n";
+  for (auto &N : Nodes) {
+    auto [Filename, ParsedFuncName] =
+        getParsedIRPGOName(Reader->getSymtab().getFuncOrVarName(N.Id));
+    if (!Filename.empty())
+      OS << "# " << Filename << "\n";
+    OS << ParsedFuncName << "\n";
+  }
+  return 0;
+}
+
+int llvm_profdata_main(int argc, char **argvNonConst,
+                       const llvm::ToolContext &) {
+  const char **argv = const_cast<const char **>(argvNonConst);
+
+  StringRef ProgName(sys::path::filename(argv[0]));
+
+  if (argc < 2) {
+    errs() << ProgName
+           << ": No subcommand specified! Run llvm-profdata --help for "
+              "usage.\n";
+    return 1;
+  }
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data\n");
+
+  if (ShowSubcommand)
+    return show_main(ProgName);
+
+  if (OrderSubcommand)
+    return order_main();
+
+  if (OverlapSubcommand)
+    return overlap_main();
+
+  if (MergeSubcommand)
+    return merge_main(ProgName);
+
+  errs() << ProgName
+         << ": Unknown command. Run llvm-profdata --help for usage.\n";
+  return 1;
+}
+
+// LDC manually added `main` function, which is generated by CMake in LLVM's
+// build. See LLVM's llvm-driver-template.cpp.in.
+#include "llvm/Support/InitLLVM.h"
+int main(int argc, char **argv) {
+  llvm::InitLLVM X(argc, argv);
+  return llvm_profdata_main(argc, argv, {argv[0], nullptr, false});
+}
diff --git a/tools/ldc-profgen/ldc-profgen-19.1/CMakeLists.txt b/tools/ldc-profgen/ldc-profgen-19.1/CMakeLists.txt
new file mode 100644
index 00000000000..354c63f409f
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-19.1/CMakeLists.txt
@@ -0,0 +1,25 @@
+
+set(LLVM_LINK_COMPONENTS
+  AllTargetsDescs
+  AllTargetsDisassemblers
+  AllTargetsInfos
+  DebugInfoDWARF
+  Core
+  MC
+  IPO
+  MCDisassembler
+  Object
+  ProfileData
+  Support
+  Symbolize
+  TargetParser
+  )
+
+add_llvm_tool(llvm-profgen
+  llvm-profgen.cpp
+  PerfReader.cpp
+  CSPreInliner.cpp
+  ProfiledBinary.cpp
+  ProfileGenerator.cpp
+  MissingFrameInferrer.cpp
+  )
diff --git a/tools/ldc-profgen/ldc-profgen-19.1/CSPreInliner.cpp b/tools/ldc-profgen/ldc-profgen-19.1/CSPreInliner.cpp
new file mode 100644
index 00000000000..87df6996aa4
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-19.1/CSPreInliner.cpp
@@ -0,0 +1,316 @@
+//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CSPreInliner.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Transforms/IPO/SampleProfile.h" +#include +#include + +#define DEBUG_TYPE "cs-preinliner" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(PreInlNumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(PreInlNumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); +STATISTIC(PreInlNumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(PreInlNumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + PreInlNumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); + +// The switches specify inline thresholds used in SampleProfileLoader inlining. +// TODO: the actual threshold to be tuned here because the size here is based +// on machine code not LLVM IR. +namespace llvm { +cl::opt EnableCSPreInliner( + "csspgo-preinliner", cl::Hidden, cl::init(true), + cl::desc("Run a global pre-inliner to merge context profile based on " + "estimated global top-down inline decisions")); + +cl::opt UseContextCostForPreInliner( + "use-context-cost-for-preinliner", cl::Hidden, cl::init(true), + cl::desc("Use context-sensitive byte size cost for preinliner decisions")); +} // namespace llvm + +static cl::opt SamplePreInlineReplay( + "csspgo-replay-preinline", cl::Hidden, cl::init(false), + cl::desc( + "Replay previous inlining and adjust context profile accordingly")); + +static cl::opt CSPreinlMultiplierForPrevInl( + "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden, + cl::init(100), + cl::desc( + "Multiplier to bump up callsite threshold for previous inlining.")); + +CSPreInliner::CSPreInliner(SampleContextTracker &Tracker, + ProfiledBinary &Binary, ProfileSummary *Summary) + : UseContextCost(UseContextCostForPreInliner), + // TODO: Pass in a guid-to-name map in order for + // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes + // as their profile context. + ContextTracker(Tracker), Binary(Binary), Summary(Summary) { + // Set default preinliner hot/cold call site threshold tuned with CSSPGO. + // for good performance with reasonable profile size. + if (!SampleHotCallSiteThreshold.getNumOccurrences()) + SampleHotCallSiteThreshold = 1500; + if (!SampleColdCallSiteThreshold.getNumOccurrences()) + SampleColdCallSiteThreshold = 0; + if (!ProfileInlineLimitMax.getNumOccurrences()) + ProfileInlineLimitMax = 50000; +} + +std::vector CSPreInliner::buildTopDownOrder() { + std::vector Order; + // Trim cold edges to get a more stable call graph. This allows for a more + // stable top-down order which in turns helps the stablity of the generated + // profile from run to run. + uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); + ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold); + + // Now that we have a profiled call graph, construct top-down order + // by building up SCC and reversing SCC order. 
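+  // Illustrative example (not from the source): for a call graph
+  // main -> foo -> bar, scc_begin() yields SCCs bottom-up (bar, foo, main),
+  // so the std::reverse below produces the top-down order (main, foo, bar).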
+ scc_iterator I = scc_begin(&ProfiledCG); + while (!I.isAtEnd()) { + auto Range = *I; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator SI(*I); + Range = *SI; + } + for (auto *Node : Range) { + if (Node != ProfiledCG.getEntryNode()) + Order.push_back(Node->Name); + } + ++I; + } + std::reverse(Order.begin(), Order.end()); + + return Order; +} + +bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *CallerSamples) { + assert(CallerSamples && "Expect non-null caller samples"); + + // Ideally we want to consider everything a function calls, but as far as + // context profile is concerned, only those frames that are children of + // current one in the trie is relavent. So we walk the trie instead of call + // targets from function profile. + ContextTrieNode *CallerNode = + ContextTracker.getContextNodeForProfile(CallerSamples); + + bool HasNewCandidate = false; + for (auto &Child : CallerNode->getAllChildContext()) { + ContextTrieNode *CalleeNode = &Child.second; + FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); + if (!CalleeSamples) + continue; + + // Call site count is more reliable, so we look up the corresponding call + // target profile in caller's context profile to retrieve call site count. + uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = CalleeNode->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + auto It = CallTargets->find(CalleeSamples->getFunction()); + if (It != CallTargets->end()) + CallsiteCount = It->second; + } + + // TODO: call site and callee entry count should be mostly consistent, add + // check for that. + HasNewCandidate = true; + uint32_t CalleeSize = getFuncSize(CalleeNode); + CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), + CalleeSize); + } + + return HasNewCandidate; +} + +uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) { + if (UseContextCost) + return Binary.getFuncSizeForContext(ContextNode); + + return ContextNode->getFunctionSamples()->getBodySamples().size(); +} + +bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + // If replay inline is requested, simply follow the inline decision of the + // profiled binary. + if (SamplePreInlineReplay) + return WasInlined; + + unsigned int SampleThreshold = SampleColdCallSiteThreshold; + uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); + + if (Candidate.CallsiteCount <= ColdCountThreshold) + SampleThreshold = SampleColdCallSiteThreshold; + else { + // Linearly adjust threshold based on normalized hotness, i.e, a value in + // [0,1]. Use 10% cutoff instead of the max count as the normalization + // upperbound for stability. + double NormalizationUpperBound = + ProfileSummaryBuilder::getEntryForPercentile( + Summary->getDetailedSummary(), 100000 /* 10% */) + .MinCount; + double NormalizationLowerBound = ColdCountThreshold; + double NormalizedHotness = + (Candidate.CallsiteCount - NormalizationLowerBound) / + (NormalizationUpperBound - NormalizationLowerBound); + if (NormalizedHotness > 1.0) + NormalizedHotness = 1.0; + // Add 1 to ensure hot callsites get a non-zero threshold, which could + // happen when SampleColdCallSiteThreshold is 0. 
This is when we do not + // want any inlining for cold callsites. + SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 + + SampleColdCallSiteThreshold + 1; + // Bump up the threshold to favor previous compiler inline decision. The + // compiler has more insight and knowledge about functions based on their IR + // and attribures and should be able to make a more reasonable inline + // decision. + if (WasInlined) + SampleThreshold *= CSPreinlMultiplierForPrevInl; + } + + return (Candidate.SizeCost < SampleThreshold); +} + +void CSPreInliner::processFunction(const FunctionId Name) { + FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); + if (!FSamples) + return; + + unsigned FuncSize = + getFuncSize(ContextTracker.getContextNodeForProfile(FSamples)); + unsigned FuncFinalSize = FuncSize; + unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + + LLVM_DEBUG(dbgs() << "Process " << Name + << " for context-sensitive pre-inlining (pre-inline size: " + << FuncSize << ", size limit: " << SizeLimit << ")\n"); + + ProfiledCandidateQueue CQueue; + getInlineCandidates(CQueue, FSamples); + + while (!CQueue.empty() && FuncFinalSize < SizeLimit) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool ShouldInline = false; + if ((ShouldInline = shouldInline(Candidate))) { + // We mark context as inlined as the corresponding context profile + // won't be merged into that function's base profile. + ++PreInlNumCSInlined; + ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); + Candidate.CalleeSamples->getContext().setAttribute( + ContextShouldBeInlined); + FuncFinalSize += Candidate.SizeCost; + getInlineCandidates(CQueue, Candidate.CalleeSamples); + } else { + ++PreInlNumCSNotInlined; + } + LLVM_DEBUG( + dbgs() << (ShouldInline ? " Inlined" : " Outlined") + << " context profile for: " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (callee size: " << Candidate.SizeCost + << ", call count:" << Candidate.CallsiteCount << ")\n"); + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++PreInlNumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++PreInlNumCSInlinedHitMinLimit; + else + ++PreInlNumCSInlinedHitGrowthLimit; + } + + LLVM_DEBUG({ + if (!CQueue.empty()) + dbgs() << " Inline candidates ignored due to size limit (inliner " + "original size: " + << FuncSize << ", inliner final size: " << FuncFinalSize + << ", size limit: " << SizeLimit << ")\n"; + + while (!CQueue.empty()) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + dbgs() << " " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (candidate size:" << Candidate.SizeCost + << ", call count: " << Candidate.CallsiteCount << ", previously " + << (WasInlined ? 
"inlined)\n" : "not inlined)\n"); + } + }); +} + +void CSPreInliner::run() { +#ifndef NDEBUG + auto printProfileNames = [](SampleContextTracker &ContextTracker, + bool IsInput) { + uint32_t Size = 0; + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + Size++; + dbgs() << " [" << ContextTracker.getContextString(Node) << "] " + << FSamples->getTotalSamples() << ":" + << FSamples->getHeadSamples() << "\n"; + } + } + dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" + << Size << " total):\n"; + }; +#endif + + LLVM_DEBUG(printProfileNames(ContextTracker, true)); + + // Execute global pre-inliner to estimate a global top-down inline + // decision and merge profiles accordingly. This helps with profile + // merge for ThinLTO otherwise we won't be able to merge profiles back + // to base profile across module/thin-backend boundaries. + // It also helps better compress context profile to control profile + // size, as we now only need context profile for functions going to + // be inlined. + for (FunctionId FuncName : buildTopDownOrder()) { + processFunction(FuncName); + } + + // Not inlined context profiles are merged into its base, so we can + // trim out such profiles from the output. + for (auto *Node : ContextTracker) { + FunctionSamples *FProfile = Node->getFunctionSamples(); + if (FProfile && + (Node->getParentContext() != &ContextTracker.getRootContext() && + !FProfile->getContext().hasState(InlinedContext))) { + Node->setFunctionSamples(nullptr); + } + } + FunctionSamples::ProfileIsPreInlined = true; + + LLVM_DEBUG(printProfileNames(ContextTracker, false)); +} diff --git a/tools/ldc-profgen/ldc-profgen-19.1/CSPreInliner.h b/tools/ldc-profgen/ldc-profgen-19.1/CSPreInliner.h new file mode 100644 index 00000000000..8a3f16a4f13 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/CSPreInliner.h @@ -0,0 +1,96 @@ +//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H +#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H + +#include "ProfiledBinary.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Inline candidate seen from profile +struct ProfiledInlineCandidate { + ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count, + uint32_t Size) + : CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {} + // Context-sensitive function profile for inline candidate + const FunctionSamples *CalleeSamples; + // Call site count for an inline candidate + // TODO: make sure entry count for context profile and call site + // target count for corresponding call are consistent. + uint64_t CallsiteCount; + // Size proxy for function under particular call context. 
+  uint64_t SizeCost;
+};
+
+// Inline candidate comparer using call site weight
+struct ProfiledCandidateComparer {
+  bool operator()(const ProfiledInlineCandidate &LHS,
+                  const ProfiledInlineCandidate &RHS) {
+    // Always prioritize inlining zero-sized functions as they do not affect
+    // the size budget. This could happen when all of the callee's code is
+    // gone and only pseudo probes are left.
+    if ((LHS.SizeCost == 0 || RHS.SizeCost == 0) &&
+        (LHS.SizeCost != RHS.SizeCost))
+      return RHS.SizeCost == 0;
+
+    if (LHS.CallsiteCount != RHS.CallsiteCount)
+      return LHS.CallsiteCount < RHS.CallsiteCount;
+
+    if (LHS.SizeCost != RHS.SizeCost)
+      return LHS.SizeCost > RHS.SizeCost;
+
+    // Tie breaker using GUID so we have stable/deterministic inlining order
+    assert(LHS.CalleeSamples && RHS.CalleeSamples &&
+           "Expect non-null FunctionSamples");
+    return LHS.CalleeSamples->getGUID() < RHS.CalleeSamples->getGUID();
+  }
+};
+
+using ProfiledCandidateQueue =
+    PriorityQueue<ProfiledInlineCandidate,
+                  std::vector<ProfiledInlineCandidate>,
+                  ProfiledCandidateComparer>;
+
+// Pre-compilation inliner based on context-sensitive profile.
+// The PreInliner estimates inline decisions using hotness from the profile
+// and cost estimation from machine code size. It helps merge context
+// profiles globally and achieves better post-inline profile quality, which
+// otherwise wouldn't be possible for ThinLTO. It also reduces context
+// profile size by only keeping contexts that are estimated to be inlined.
+class CSPreInliner {
+public:
+  CSPreInliner(SampleContextTracker &Tracker, ProfiledBinary &Binary,
+               ProfileSummary *Summary);
+  void run();
+
+private:
+  bool getInlineCandidates(ProfiledCandidateQueue &CQueue,
+                           const FunctionSamples *FCallerContextSamples);
+  std::vector<FunctionId> buildTopDownOrder();
+  void processFunction(FunctionId Name);
+  bool shouldInline(ProfiledInlineCandidate &Candidate);
+  uint32_t getFuncSize(const ContextTrieNode *ContextNode);
+  bool UseContextCost;
+  SampleContextTracker &ContextTracker;
+  ProfiledBinary &Binary;
+  ProfileSummary *Summary;
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-19.1/CallContext.h b/tools/ldc-profgen/ldc-profgen-19.1/CallContext.h
new file mode 100644
index 00000000000..574833bfe8b
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-19.1/CallContext.h
@@ -0,0 +1,58 @@
+//===-- CallContext.h - Call Context Handler ---------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H
+#define LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H
+
+#include "llvm/ProfileData/SampleProf.h"
+#include <sstream>
+#include <string>
+
+namespace llvm {
+namespace sampleprof {
+
+inline std::string getCallSite(const SampleContextFrame &Callsite) {
+  std::string CallsiteStr = Callsite.Func.str();
+  CallsiteStr += ":";
+  CallsiteStr += Twine(Callsite.Location.LineOffset).str();
+  if (Callsite.Location.Discriminator > 0) {
+    CallsiteStr += ".";
+    CallsiteStr += Twine(Callsite.Location.Discriminator).str();
+  }
+  return CallsiteStr;
+}
+
+// TODO: This operation is expensive. If it ever gets called multiple times we
+// may think of making a class wrapper with internal states for it.
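+// For illustration (hypothetical frames): a context of [main:3, foo:2] would
+// print as "main:3 @ foo:2" here, and as "foo:2 @ main:3" by the reversed
+// variant further below.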
+inline std::string getLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : Context) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +// Reverse call context, i.e., in the order of callee frames to caller frames, +// is useful during instruction printing or pseudo probe printing. +inline std::string +getReversedLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : reverse(Context)) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-19.1/ErrorHandling.h b/tools/ldc-profgen/ldc-profgen-19.1/ErrorHandling.h new file mode 100644 index 00000000000..b797add8a89 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/ErrorHandling.h @@ -0,0 +1,56 @@ +//===-- ErrorHandling.h - Error handler -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H +#define LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/WithColor.h" +#include + +using namespace llvm; + +[[noreturn]] inline void exitWithError(const Twine &Message, + StringRef Whence = StringRef(), + StringRef Hint = StringRef()) { + WithColor::error(errs(), "llvm-profgen"); + if (!Whence.empty()) + errs() << Whence.str() << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint.str() << "\n"; + ::exit(EXIT_FAILURE); +} + +[[noreturn]] inline void exitWithError(std::error_code EC, + StringRef Whence = StringRef()) { + exitWithError(EC.message(), Whence); +} + +[[noreturn]] inline void exitWithError(Error E, StringRef Whence) { + exitWithError(errorToErrorCode(std::move(E)), Whence); +} + +template +T unwrapOrError(Expected EO, Ts &&... Args) { + if (EO) + return std::move(*EO); + exitWithError(EO.takeError(), std::forward(Args)...); +} + +inline void emitWarningSummary(uint64_t Num, uint64_t Total, StringRef Msg) { + if (!Total || !Num) + return; + WithColor::warning() << format("%.2f", static_cast(Num) * 100 / Total) + << "%(" << Num << "/" << Total << ") " << Msg << "\n"; +} + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-19.1/MissingFrameInferrer.cpp b/tools/ldc-profgen/ldc-profgen-19.1/MissingFrameInferrer.cpp new file mode 100644 index 00000000000..ee49950f39c --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/MissingFrameInferrer.cpp @@ -0,0 +1,316 @@ +//===-- MissingFrameInferrer.cpp - Missing frame inferrer --------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MissingFrameInferrer.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include +#include +#include +#include +#include + +#define DEBUG_TYPE "missing-frame-inferrer" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(TailCallUniReachable, + "Number of frame pairs reachable via a unique tail call path"); +STATISTIC(TailCallMultiReachable, + "Number of frame pairs reachable via a multiple tail call paths"); +STATISTIC(TailCallUnreachable, + "Number of frame pairs unreachable via any tail call path"); +STATISTIC(TailCallFuncSingleTailCalls, + "Number of functions with single tail call site"); +STATISTIC(TailCallFuncMultipleTailCalls, + "Number of functions with multiple tail call sites"); +STATISTIC(TailCallMaxTailCallPath, "Length of the longest tail call path"); + +static cl::opt + MaximumSearchDepth("max-search-depth", cl::init(UINT32_MAX - 1), + cl::desc("The maximum levels the DFS-based missing " + "frame search should go with")); + +void MissingFrameInferrer::initialize( + const ContextSampleCounterMap *SampleCounters) { + // Refine call edges based on LBR samples. + if (SampleCounters) { + std::unordered_map> SampledCalls; + std::unordered_map> SampledTailCalls; + + // Populate SampledCalls based on static call sites. Similarly to + // SampledTailCalls. + for (const auto &CI : *SampleCounters) { + for (auto Item : CI.second.BranchCounter) { + auto From = Item.first.first; + auto To = Item.first.second; + if (CallEdges.count(From)) { + assert(CallEdges[From].size() == 1 && + "A callsite should only appear once with either a known or a " + "zero (unknown) target value at this point"); + SampledCalls[From].insert(To); + } + if (TailCallEdges.count(From)) { + assert(TailCallEdges[From].size() == 1 && + "A callsite should only appear once with either a known or a " + "zero (unknown) target value at this point"); + FuncRange *FromFRange = Binary->findFuncRange(From); + FuncRange *ToFRange = Binary->findFuncRange(To); + if (FromFRange != ToFRange) + SampledTailCalls[From].insert(To); + } + } + } + + // Replace static edges with dynamic edges. + CallEdges = SampledCalls; + TailCallEdges = SampledTailCalls; + } + + // Populate function-based edges. This is to speed up address to function + // translation. + for (auto Call : CallEdges) + for (auto Target : Call.second) + if (FuncRange *ToFRange = Binary->findFuncRange(Target)) + CallEdgesF[Call.first].insert(ToFRange->Func); + + for (auto Call : TailCallEdges) { + for (auto Target : Call.second) { + if (FuncRange *ToFRange = Binary->findFuncRange(Target)) { + TailCallEdgesF[Call.first].insert(ToFRange->Func); + TailCallTargetFuncs.insert(ToFRange->Func); + } + } + if (FuncRange *FromFRange = Binary->findFuncRange(Call.first)) + FuncToTailCallMap[FromFRange->Func].push_back(Call.first); + } + +#if LLVM_ENABLE_STATS + for (auto F : FuncToTailCallMap) { + assert(F.second.size() > 0 && ""); + if (F.second.size() > 1) + TailCallFuncMultipleTailCalls++; + else + TailCallFuncSingleTailCalls++; + } +#endif + +#ifndef NDEBUG + auto PrintCallTargets = + [&](const std::unordered_map> + &CallTargets, + bool IsTailCall) { + for (const auto &Targets : CallTargets) { + for (const auto &Target : Targets.second) { + dbgs() << (IsTailCall ? 
"TailCall" : "Call"); + dbgs() << " From " << format("%8" PRIx64, Targets.first) << " to " + << format("%8" PRIx64, Target) << "\n"; + } + } + }; + + LLVM_DEBUG(dbgs() << "============================\n "; + dbgs() << "Call targets:\n"; + PrintCallTargets(CallEdges, false); + dbgs() << "\nTail call targets:\n"; + PrintCallTargets(CallEdges, true); + dbgs() << "============================\n";); +#endif +} + +uint64_t MissingFrameInferrer::computeUniqueTailCallPath( + BinaryFunction *From, BinaryFunction *To, SmallVectorImpl &Path) { + // Search for a unique path comprised of only tail call edges for a given + // source and target frame address on the a tail call graph that consists of + // only tail call edges. Note that only a unique path counts. Multiple paths + // are treated unreachable. + if (From == To) + return 1; + + // Ignore cyclic paths. Since we are doing a recursive DFS walk, if the source + // frame being visited is already in the stack, it means we are seeing a + // cycle. This is done before querying the cached result because the cached + // result may be computed based on the same path. Consider the following case: + // A -> B, B -> A, A -> D + // When computing unique reachablity from A to D, the cached result for (B,D) + // should not be counted since the unique path B->A->D is basically the same + // path as A->D. Counting that with invalidate the uniqueness from A to D. + if (Visiting.contains(From)) + return 0; + + // If already computed, return the cached result. + auto I = UniquePaths.find({From, To}); + if (I != UniquePaths.end()) { + Path.append(I->second.begin(), I->second.end()); + return 1; + } + + auto J = NonUniquePaths.find({From, To}); + if (J != NonUniquePaths.end()) { + return J->second; + } + + uint64_t Pos = Path.size(); + + // DFS walk each outgoing tail call edges. + // Bail out if we are already at the the maximum searching depth. + if (CurSearchingDepth == MaximumSearchDepth) + return 0; + + + if (!FuncToTailCallMap.count(From)) + return 0; + + CurSearchingDepth++; + Visiting.insert(From); + uint64_t NumPaths = 0; + for (auto TailCall : FuncToTailCallMap[From]) { + NumPaths += computeUniqueTailCallPath(TailCall, To, Path); + // Stop analyzing the remaining if we are already seeing more than one + // reachable paths. + if (NumPaths > 1) + break; + } + CurSearchingDepth--; + Visiting.erase(From); + + // Undo already-computed path if it is not unique. + if (NumPaths != 1) { + Path.pop_back_n(Path.size() - Pos); + } + + // Cache the result. + if (NumPaths == 1) { + UniquePaths[{From, To}].assign(Path.begin() + Pos, Path.end()); +#if LLVM_ENABLE_STATS + auto &LocalPath = UniquePaths[{From, To}]; + assert((LocalPath.size() <= MaximumSearchDepth + 1) && + "Path should not be longer than the maximum searching depth"); + TailCallMaxTailCallPath = std::max(uint64_t(LocalPath.size()), + TailCallMaxTailCallPath.getValue()); +#endif + } else { + NonUniquePaths[{From, To}] = NumPaths; + } + + return NumPaths; +} + +uint64_t MissingFrameInferrer::computeUniqueTailCallPath( + uint64_t From, BinaryFunction *To, SmallVectorImpl &Path) { + if (!TailCallEdgesF.count(From)) + return 0; + Path.push_back(From); + uint64_t NumPaths = 0; + for (auto Target : TailCallEdgesF[From]) { + NumPaths += computeUniqueTailCallPath(Target, To, Path); + // Stop analyzing the remaining if we are already seeing more than one + // reachable paths. + if (NumPaths > 1) + break; + } + + // Undo already-computed path if it is not unique. 
+ if (NumPaths != 1) + Path.pop_back(); + return NumPaths; +} + +bool MissingFrameInferrer::inferMissingFrames( + uint64_t From, uint64_t To, SmallVectorImpl &UniquePath) { + assert(!TailCallEdgesF.count(From) && + "transition between From and To cannot be via a tailcall otherwise " + "they would not show up at the same time"); + UniquePath.push_back(From); + uint64_t Pos = UniquePath.size(); + + FuncRange *ToFRange = Binary->findFuncRange(To); + if (!ToFRange) + return false; + + // Bail out if caller has no known outgoing call edges. + if (!CallEdgesF.count(From)) + return false; + + // Done with the inference if the calle is reachable via a single callsite. + // This may not be accurate but it improves the search throughput. + if (llvm::is_contained(CallEdgesF[From], ToFRange->Func)) + return true; + + // Bail out if callee is not tailcall reachable at all. + if (!TailCallTargetFuncs.contains(ToFRange->Func)) + return false; + + Visiting.clear(); + CurSearchingDepth = 0; + uint64_t NumPaths = 0; + for (auto Target : CallEdgesF[From]) { + NumPaths += + computeUniqueTailCallPath(Target, ToFRange->Func, UniquePath); + // Stop analyzing the remaining if we are already seeing more than one + // reachable paths. + if (NumPaths > 1) + break; + } + + // Undo already-computed path if it is not unique. + if (NumPaths != 1) { + UniquePath.pop_back_n(UniquePath.size() - Pos); + assert(UniquePath.back() == From && "broken path"); + } + +#if LLVM_ENABLE_STATS + if (NumPaths == 1) { + if (ReachableViaUniquePaths.insert({From, ToFRange->StartAddress}).second) + TailCallUniReachable++; + } else if (NumPaths == 0) { + if (Unreachables.insert({From, ToFRange->StartAddress}).second) { + TailCallUnreachable++; + LLVM_DEBUG(dbgs() << "No path found from " + << format("%8" PRIx64 ":", From) << " to " + << format("%8" PRIx64 ":", ToFRange->StartAddress) + << "\n"); + } + } else if (NumPaths > 1) { + if (ReachableViaMultiPaths.insert({From, ToFRange->StartAddress}) + .second) { + TailCallMultiReachable++; + LLVM_DEBUG(dbgs() << "Multiple paths found from " + << format("%8" PRIx64 ":", From) << " to " + << format("%8" PRIx64 ":", ToFRange->StartAddress) + << "\n"); + } + } +#endif + + return NumPaths == 1; +} + +void MissingFrameInferrer::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + if (Context.size() == 1) { + NewContext = Context; + return; + } + + NewContext.clear(); + for (uint64_t I = 1; I < Context.size(); I++) { + inferMissingFrames(Context[I - 1], Context[I], NewContext); + } + NewContext.push_back(Context.back()); + + assert((NewContext.size() >= Context.size()) && + "Inferred context should include all frames in the original context"); + assert((NewContext.size() > Context.size() || NewContext == Context) && + "Inferred context should be exactly the same " + "with the original context"); +} diff --git a/tools/ldc-profgen/ldc-profgen-19.1/MissingFrameInferrer.h b/tools/ldc-profgen/ldc-profgen-19.1/MissingFrameInferrer.h new file mode 100644 index 00000000000..4680a9a979f --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/MissingFrameInferrer.h @@ -0,0 +1,116 @@ +//===-- MissingFrameInferrer.h - Missing frame inferrer ---------- C++/-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_MISSINGFRAMEINFERRER_H +#define LLVM_TOOLS_LLVM_PROFGEN_MISSINGFRAMEINFERRER_H + +#include "PerfReader.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include +#include + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; +struct BinaryFunction; + +class MissingFrameInferrer { +public: + MissingFrameInferrer(ProfiledBinary *Binary) : Binary(Binary) {} + + // Defininig a frame transition from a caller function to the callee function. + using CallerCalleePair = std::pair; + + void initialize(const ContextSampleCounterMap *SampleCounters); + + // Given an input `Context`, output `NewContext` with inferred missing tail + // call frames. + void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + +private: + friend class ProfiledBinary; + + // Compute a unique tail call path for a pair of source frame address and + // target frame address. Append the unique path prefix (not including `To`) to + // `UniquePath` if exists. Return the whether this's a unqiue tail call + // path. The source/dest frame will typically be a pair of adjacent frame + // entries of call stack samples. + bool inferMissingFrames(uint64_t From, uint64_t To, + SmallVectorImpl &UniquePath); + + // Compute a unique tail call path from the source frame address to the target + // function. Output the unique path prefix (not including `To`) in + // `UniquePath` if exists. Return the number of possibly availabe tail call + // paths. + uint64_t computeUniqueTailCallPath(uint64_t From, BinaryFunction *To, + SmallVectorImpl &UniquePath); + + // Compute a unique tail call path from the source function to the target + // function. Output the unique path prefix (not including `To`) in + // `UniquePath` if exists. Return the number of possibly availabe tail call + // paths. + uint64_t computeUniqueTailCallPath(BinaryFunction *From, BinaryFunction *To, + SmallVectorImpl &UniquePath); + + ProfiledBinary *Binary; + + // A map of call instructions to their target addresses. This is first + // populated with static call edges but then trimmed down to dynamic call + // edges based on LBR samples. + std::unordered_map> CallEdges; + + // A map of tail call instructions to their target addresses. This is first + // populated with static call edges but then trimmed down to dynamic call + // edges based on LBR samples. + std::unordered_map> TailCallEdges; + + // Dynamic call targets in terms of BinaryFunction for any calls. + std::unordered_map> CallEdgesF; + + // Dynamic call targets in terms of BinaryFunction for tail calls. + std::unordered_map> + TailCallEdgesF; + + // Dynamic tail call targets of caller functions. + std::unordered_map> FuncToTailCallMap; + + // Functions that are reachable via tail calls. + DenseSet TailCallTargetFuncs; + + struct PairHash { + std::size_t operator()( + const std::pair &Pair) const { + return std::hash()(Pair.first) ^ + std::hash()(Pair.second); + } + }; + + // Cached results from a CallerCalleePair to a unique call path between them. + std::unordered_map, PairHash> + UniquePaths; + // Cached results from CallerCalleePair to the number of available call paths. 
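+  // (A cached count of 0 and one of >1 both mean "no unique path"; keeping
+  // them avoids re-running the DFS for frequently seen caller/callee pairs.)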
+ std::unordered_map NonUniquePaths; + + DenseSet Visiting; + + uint32_t CurSearchingDepth = 0; + +#if LLVM_ENABLE_STATS + DenseSet> ReachableViaUniquePaths; + DenseSet> Unreachables; + DenseSet> ReachableViaMultiPaths; +#endif +}; +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-19.1/PerfReader.cpp b/tools/ldc-profgen/ldc-profgen-19.1/PerfReader.cpp new file mode 100644 index 00000000000..4041271cc0a --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/PerfReader.cpp @@ -0,0 +1,1381 @@ +//===-- PerfReader.cpp - perfscript reader ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "PerfReader.h" +#include "ProfileGenerator.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/ToolOutputFile.h" + +#define DEBUG_TYPE "perf-reader" + +cl::opt SkipSymbolization("skip-symbolization", + cl::desc("Dump the unsymbolized profile to the " + "output file. It will show unwinder " + "output for CS profile generation.")); + +static cl::opt ShowMmapEvents("show-mmap-events", + cl::desc("Print binary load events.")); + +static cl::opt + UseOffset("use-offset", cl::init(true), + cl::desc("Work with `--skip-symbolization` or " + "`--unsymbolized-profile` to write/read the " + "offset instead of virtual address.")); + +static cl::opt UseLoadableSegmentAsBase( + "use-first-loadable-segment-as-base", + cl::desc("Use first loadable segment address as base address " + "for offsets in unsymbolized profile. By default " + "first executable segment address is used")); + +static cl::opt + IgnoreStackSamples("ignore-stack-samples", + cl::desc("Ignore call stack samples for hybrid samples " + "and produce context-insensitive profile.")); +cl::opt ShowDetailedWarning("show-detailed-warning", + cl::desc("Show detailed warning message.")); +cl::opt + LeadingIPOnly("leading-ip-only", + cl::desc("Form a profile based only on sample IPs")); + +static cl::list PerfEventFilter( + "perf-event", + cl::desc("Ignore samples not matching the given event names")); +static cl::alias + PerfEventFilterPlural("perf-events", cl::CommaSeparated, + cl::desc("Comma-delimited version of -perf-event"), + cl::aliasopt(PerfEventFilter)); + +static cl::opt + SamplePeriod("sample-period", cl::init(1), + cl::desc("The sampling period (-c) used for perf data")); + +extern cl::opt PerfTraceFilename; +extern cl::opt ShowDisassemblyOnly; +extern cl::opt ShowSourceLocations; +extern cl::opt OutputFilename; + +namespace llvm { +namespace sampleprof { + +void VirtualUnwinder::unwindCall(UnwindState &State) { + uint64_t Source = State.getCurrentLBRSource(); + auto *ParentFrame = State.getParentFrame(); + // The 2nd frame after leaf could be missing if stack sample is + // taken when IP is within prolog/epilog, as frame chain isn't + // setup yet. Fill in the missing frame in that case. + // TODO: Currently we just assume all the addr that can't match the + // 2nd frame is in prolog/epilog. In the future, we will switch to + // pro/epi tracker(Dwarf CFI) for the precise check. 
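+  // Illustrative example (hypothetical frames): if the sample lands in foo's
+  // prolog before the frame chain is set up, the parent frame won't match the
+  // LBR source still sitting in the caller bar; switchToFrame(Source) below
+  // fills in that missing caller frame.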
+ if (ParentFrame == State.getDummyRootPtr() || + ParentFrame->Address != Source) { + State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (Source == ExternalAddr) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } + } else { + State.popFrame(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + + if (End == ExternalAddr && Target == ExternalAddr) { + // Filter out the case when leaf external frame matches the external LBR + // target, this is a valid state, it happens that the code run into external + // address then return back. The call frame under the external frame + // remains valid and can be unwound later, just skip recording this range. + NumPairedExtAddr++; + return; + } + + if (End == ExternalAddr || Target == ExternalAddr) { + // Range is invalid if only one point is external address. This means LBR + // traces contains a standalone external address failing to pair another + // one, likely due to interrupt jmp or broken perf script. Set the + // state to invalid. + NumUnpairedExtAddr++; + State.setInvalid(); + return; + } + + if (!isValidFallThroughRange(Target, End, Binary)) { + // Skip unwinding the rest of LBR trace when a bogus range is seen. + State.setInvalid(); + return; + } + + if (Binary->usePseudoProbes()) { + // We don't need to top frame probe since it should be extracted + // from the range. + // The outcome of the virtual unwinding with pseudo probes is a + // map from a context key to the address range being unwound. + // This means basically linear unwinding is not needed for pseudo + // probes. The range will be simply recorded here and will be + // converted to a list of pseudo probes to report in ProfileGenerator. + State.getParentFrame()->recordRangeCount(Target, End, Repeat); + } else { + // Unwind linear execution part. + // Split and record the range by different inline context. For example: + // [0x01] ... main:1 # Target + // [0x02] ... main:2 + // [0x03] ... main:3 @ foo:1 + // [0x04] ... main:3 @ foo:2 + // [0x05] ... main:3 @ foo:3 + // [0x06] ... main:4 + // [0x07] ... main:5 # End + // It will be recorded: + // [main:*] : [0x06, 0x07], [0x01, 0x02] + // [main:3 @ foo:*] : [0x03, 0x05] + while (IP.Address > Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee) { + State.switchToFrame(PrevIP); + State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat); + End = IP.Address; + } + } + assert(IP.Address == Target && "The last one must be the target address."); + // Record the remaining range, [0x01, 0x02] in the example + State.switchToFrame(IP.Address); + State.CurrentLeafFrame->recordRangeCount(IP.Address, End, Repeat); + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToFrame(CallAddr); + State.pushFrame(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranch(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. + // This is only for intra function branches, excluding tail calls. 
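+  // (An intra-function branch keeps the unwinder in the same frame; only the
+  // current leaf frame and the instruction pointer are refreshed below.)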
+ uint64_t Source = State.getCurrentLBRSource(); + State.switchToFrame(Source); + State.InstPtr.update(Source); +} + +std::shared_ptr FrameStack::getContextKey() { + std::shared_ptr KeyStr = + std::make_shared(); + KeyStr->Context = Binary->getExpandedContext(Stack, KeyStr->WasLeafInlined); + return KeyStr; +} + +std::shared_ptr AddressStack::getContextKey() { + std::shared_ptr KeyStr = std::make_shared(); + KeyStr->Context = Stack; + CSProfileGenerator::compressRecursionContext(KeyStr->Context); + CSProfileGenerator::trimContext(KeyStr->Context); + return KeyStr; +} + +template +void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, + T &Stack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + + std::shared_ptr Key = Stack.getContextKey(); + if (Key == nullptr) + return; + auto Ret = CtxCounterMap->emplace(Hashable(Key), SampleCounter()); + SampleCounter &SCounter = Ret.first->second; + for (auto &I : Cur->RangeSamples) + SCounter.recordRangeCount(std::get<0>(I), std::get<1>(I), std::get<2>(I)); + + for (auto &I : Cur->BranchSamples) + SCounter.recordBranchCount(std::get<0>(I), std::get<1>(I), std::get<2>(I)); +} + +template +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur, T &Stack) { + if (!Cur->isDummyRoot()) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. + if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { + // Process truncated context + // Start a new traversal ignoring its bottom context + T EmptyStack(Binary); + collectSamplesFromFrame(Cur, EmptyStack); + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), EmptyStack); + } + + // Keep note of untracked call site and deduplicate them + // for warning later. + if (!Cur->isLeafFrame()) + UntrackedCallsites.insert(Cur->Address); + + return; + } + } + + collectSamplesFromFrame(Cur, Stack); + // Process children frame + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), Stack); + } + // Recover the call stack + Stack.popFrame(); +} + +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur) { + if (Binary->usePseudoProbes()) { + AddressStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } else { + FrameStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.Target == ExternalAddr) + return; + + // Record external-to-internal pattern on the trie root, it later can be + // used for generating head samples. + if (Branch.Source == ExternalAddr) { + State.getDummyRootPtr()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + return; + } + + if (Binary->usePseudoProbes()) { + // Same as recordRangeCount, We don't need to top frame probe since we will + // extract it from branch's source address + State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } else { + State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } +} + +bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) { + // Capture initial state as starting point for unwinding. + UnwindState State(Sample, Binary); + + // Sanity check - making sure leaf of LBR aligns with leaf of stack sample + // Stack sample sometimes can be unreliable, so filter out bogus ones. 
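+  // For example (illustrative), a stack sample whose leaf frame disagrees
+  // with the leaf LBR entry would be dropped here instead of producing a
+  // bogus calling context.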
+ if (!State.validateInitialState()) + return false; + + NumTotalBranches += State.LBRStack.size(); + // Now process the LBR samples in parrallel with stack sample + // Note that we do not reverse the LBR entry order so we can + // unwind the sample stack as we walk through LBR entries. + while (State.hasNextLBR()) { + State.checkStateConsistency(); + + // Do not attempt linear unwind for the leaf range as it's incomplete. + if (!State.IsLastLBR()) { + // Unwind implicit calls/returns from inlining, along the linear path, + // break into smaller sub section each with its own calling context. + unwindLinear(State, Repeat); + } + + // Save the LBR branch before it gets unwound. + const LBREntry &Branch = State.getCurrentLBR(); + if (isCallState(State)) { + // Unwind calls - we know we encountered call if LBR overlaps with + // transition between leaf the 2nd frame. Note that for calls that + // were not in the original stack sample, we should have added the + // extra frame when processing the return paired with this call. + unwindCall(State); + } else if (isReturnState(State)) { + // Unwind returns - check whether the IP is indeed at a return + // instruction + unwindReturn(State); + } else if (isValidState(State)) { + // Unwind branches + unwindBranch(State); + } else { + // Skip unwinding the rest of LBR trace. Reset the stack and update the + // state so that the rest of the trace can still be processed as if they + // do not have stack samples. + State.clearCallStack(); + State.InstPtr.update(State.getCurrentLBRSource()); + State.pushFrame(State.InstPtr.Address); + } + + State.advanceLBR(); + // Record `branch` with calling context after unwinding. + recordBranchCount(Branch, State, Repeat); + } + // As samples are aggregated on trie, record them into counter map + collectSamplesFromFrameTrie(State.getDummyRootPtr()); + + return true; +} + +std::unique_ptr +PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput, + std::optional PIDFilter) { + std::unique_ptr PerfReader; + + if (PerfInput.Format == PerfFormat::UnsymbolizedProfile) { + PerfReader.reset( + new UnsymbolizedProfileReader(Binary, PerfInput.InputFile)); + return PerfReader; + } + + // For perf data input, we need to convert them into perf script first. + // If this is a kernel perf file, there is no need for retrieving PIDs. + if (PerfInput.Format == PerfFormat::PerfData) + PerfInput = PerfScriptReader::convertPerfDataToTrace( + Binary, Binary->isKernel(), PerfInput, PIDFilter); + + assert((PerfInput.Format == PerfFormat::PerfScript) && + "Should be a perfscript!"); + + PerfInput.Content = + PerfScriptReader::checkPerfScriptType(PerfInput.InputFile); + if (PerfInput.Content == PerfContent::LBRStack) { + PerfReader.reset( + new HybridPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else if (PerfInput.Content == PerfContent::LBR) { + PerfReader.reset(new LBRPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else { + exitWithError("Unsupported perfscript!"); + } + + return PerfReader; +} + +PerfInputFile +PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, bool SkipPID, + PerfInputFile &File, + std::optional PIDFilter) { + StringRef PerfData = File.InputFile; + // Run perf script to retrieve PIDs matching binary we're interested in. 
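+  // The conversion is effectively two `perf script` invocations: a first pass
+  // over mmap events (below) to collect the PIDs that loaded the profiled
+  // binary, then a second pass that dumps mmap events and LBR samples for
+  // just those PIDs.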
+ auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + exitWithError("Perf not found."); + } + std::string PerfPath = *PerfExecutable; + SmallString<128> PerfTraceFile; + sys::fs::createUniquePath("perf-script-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%.tmp", + PerfTraceFile, /*MakeAbsolute=*/true); + std::string ErrorFile = std::string(PerfTraceFile) + ".err"; + std::optional Redirects[] = {std::nullopt, // Stdin + StringRef(PerfTraceFile), // Stdout + StringRef(ErrorFile)}; // Stderr + PerfScriptReader::TempFileCleanups.emplace_back(PerfTraceFile); + PerfScriptReader::TempFileCleanups.emplace_back(ErrorFile); + + std::string PIDs; + if (!SkipPID) { + StringRef ScriptMMapArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "comm,pid", "-i", + PerfData}; + sys::ExecuteAndWait(PerfPath, ScriptMMapArgs, std::nullopt, Redirects); + + // Collect the PIDs + TraceStream TraceIt(PerfTraceFile); + std::unordered_set PIDSet; + while (!TraceIt.isAtEoF()) { + MMapEvent MMap; + if (isMMapEvent(TraceIt.getCurrentLine()) && + extractMMapEventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) { + auto It = PIDSet.emplace(MMap.PID); + if (It.second && (!PIDFilter || MMap.PID == *PIDFilter)) { + if (!PIDs.empty()) { + PIDs.append(","); + } + PIDs.append(utostr(MMap.PID)); + } + } + TraceIt.advance(); + } + + if (PIDs.empty()) { + exitWithError("No relevant mmap event is found in perf data."); + } + } + + // If filtering by events was requested, additionally request the "event" + // field. + const std::string FieldList = + PerfEventFilter.empty() ? "ip,brstack" : "event,ip,brstack"; + + // Run perf script again to retrieve events for PIDs collected above + SmallVector ScriptSampleArgs; + ScriptSampleArgs.push_back(PerfPath); + ScriptSampleArgs.push_back("script"); + ScriptSampleArgs.push_back("--show-mmap-events"); + ScriptSampleArgs.push_back("-F"); + ScriptSampleArgs.push_back(FieldList); + ScriptSampleArgs.push_back("-i"); + ScriptSampleArgs.push_back(PerfData); + if (!PIDs.empty()) { + ScriptSampleArgs.push_back("--pid"); + ScriptSampleArgs.push_back(PIDs); + } + sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, std::nullopt, Redirects); + + return {std::string(PerfTraceFile), PerfFormat::PerfScript, + PerfContent::UnknownContent}; +} + +static StringRef filename(StringRef Path, bool UseBackSlash) { + llvm::sys::path::Style PathStyle = + UseBackSlash ? llvm::sys::path::Style::windows_backslash + : llvm::sys::path::Style::native; + StringRef FileName = llvm::sys::path::filename(Path, PathStyle); + + // In case this file use \r\n as newline. + if (UseBackSlash && FileName.back() == '\r') + return FileName.drop_back(); + + return FileName; +} + +void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) { + // Drop the event which doesn't belong to user-provided binary + StringRef BinaryName = filename(Event.BinaryPath, Binary->isCOFF()); + bool IsKernel = Binary->isKernel(); + if (!IsKernel && Binary->getName() != BinaryName) + return; + if (IsKernel && !Binary->isKernelImageName(BinaryName)) + return; + + // Drop the event if process does not match pid filter + if (PIDFilter && Event.PID != *PIDFilter) + return; + + // Drop the event if its image is loaded at the same address + if (Event.Address == Binary->getBaseAddress()) { + Binary->setIsLoadedByMMap(true); + return; + } + + if (IsKernel || Event.Offset == Binary->getTextSegmentOffset()) { + // A binary image could be unloaded and then reloaded at different + // place, so update binary load address. 
+    // Only update for the first executable segment and assume all other
+    // segments are loaded at consecutive memory addresses, which is the case
+    // on X64.
+    Binary->setBaseAddress(Event.Address);
+    Binary->setIsLoadedByMMap(true);
+  } else {
+    // Verify that segments are loaded consecutively.
+    const auto &Offsets = Binary->getTextSegmentOffsets();
+    auto It = llvm::lower_bound(Offsets, Event.Offset);
+    if (It != Offsets.end() && *It == Event.Offset) {
+      // The event is for loading a separate executable segment.
+      auto I = std::distance(Offsets.begin(), It);
+      const auto &PreferredAddrs = Binary->getPreferredTextSegmentAddresses();
+      if (PreferredAddrs[I] - Binary->getPreferredBaseAddress() !=
+          Event.Address - Binary->getBaseAddress())
+        exitWithError("Executable segments not loaded consecutively");
+    } else {
+      if (It == Offsets.begin())
+        exitWithError("File offset not found");
+      else {
+        // Find the segment the event falls in. A large segment could be loaded
+        // via multiple mmap calls with consecutive memory addresses.
+        --It;
+        assert(*It < Event.Offset);
+        if (Event.Offset - *It != Event.Address - Binary->getBaseAddress())
+          exitWithError("Segment not loaded by consecutive mmaps");
+      }
+    }
+  }
+}
+
+static std::string getContextKeyStr(ContextKey *K,
+                                    const ProfiledBinary *Binary) {
+  if (const auto *CtxKey = dyn_cast<StringBasedCtxKey>(K)) {
+    return SampleContext::getContextString(CtxKey->Context);
+  } else if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(K)) {
+    std::ostringstream OContextStr;
+    for (uint32_t I = 0; I < CtxKey->Context.size(); I++) {
+      if (OContextStr.str().size())
+        OContextStr << " @ ";
+      uint64_t Address = CtxKey->Context[I];
+      if (UseOffset) {
+        if (UseLoadableSegmentAsBase)
+          Address -= Binary->getFirstLoadableAddress();
+        else
+          Address -= Binary->getPreferredBaseAddress();
+      }
+      OContextStr << "0x"
+                  << utohexstr(Address,
+                               /*LowerCase=*/true);
+    }
+    return OContextStr.str();
+  } else {
+    llvm_unreachable("unexpected key type");
+  }
+}
+
+void HybridPerfReader::unwindSamples() {
+  VirtualUnwinder Unwinder(&SampleCounters, Binary);
+  for (const auto &Item : AggregatedSamples) {
+    const PerfSample *Sample = Item.first.getPtr();
+    Unwinder.unwind(Sample, Item.second);
+  }
+
+  // Warn about untracked frames due to missing probes.
+ if (ShowDetailedWarning) { + for (auto Address : Unwinder.getUntrackedCallsites()) + WithColor::warning() << "Profile context truncated due to missing probe " + << "for call instruction at " + << format("0x%" PRIx64, Address) << "\n"; + } + + emitWarningSummary(Unwinder.getUntrackedCallsites().size(), + SampleCounters.size(), + "of profiled contexts are truncated due to missing probe " + "for call instruction."); + + emitWarningSummary( + Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to unwinding error of external frame."); + + emitWarningSummary(Unwinder.NumPairedExtAddr * 2, Unwinder.NumTotalBranches, + "of branches containing paired external address."); + + emitWarningSummary(Unwinder.NumUnpairedExtAddr, Unwinder.NumTotalBranches, + "of branches containing external address but doesn't have " + "another external address to pair, likely due to " + "interrupt jmp or broken perf script."); + + emitWarningSummary( + Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to frame in prolog/epilog."); + + emitWarningSummary(Unwinder.NumMissingExternalFrame, + Unwinder.NumExtCallBranch, + "of artificial call branches but doesn't have an external " + "frame to match."); +} + +bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack) { + // The raw format of LBR stack is like: + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 0x4005c8/0x4005dc/P/-/-/0 + // It's in FIFO order and separated by whitespace. + SmallVector Records; + TraceIt.getCurrentLine().rtrim().split(Records, " ", -1, false); + auto WarnInvalidLBR = [](TraceStream &TraceIt) { + WithColor::warning() << "Invalid address in LBR record at line " + << TraceIt.getLineNumber() << ": " + << TraceIt.getCurrentLine() << "\n"; + }; + + // Skip the leading instruction pointer. + size_t Index = 0; + + StringRef EventName; + // Skip a perf event name. This may or may not exist. + if (Records.size() > Index && Records[Index].ends_with(":")) { + EventName = Records[Index].ltrim().rtrim(':'); + Index++; + + if (PerfEventFilter.empty()) { + WithColor::warning() << "No --perf-event filter was specified, but an " + "\"event\" field was found in line " + << TraceIt.getLineNumber() << ": " + << TraceIt.getCurrentLine() << "\n"; + } else if (std::find(PerfEventFilter.begin(), PerfEventFilter.end(), + EventName) == PerfEventFilter.end()) { + TraceIt.advance(); + return false; + } + + } else if (!PerfEventFilter.empty()) { + WithColor::warning() << "A --perf-event filter was specified, but no " + "\"event\" field found in line " + << TraceIt.getLineNumber() << ": " + << TraceIt.getCurrentLine() << "\n"; + } + + uint64_t LeadingAddr; + if (Records.size() > Index && !Records[Index].contains('/')) { + if (Records[Index].getAsInteger(16, LeadingAddr)) { + WarnInvalidLBR(TraceIt); + TraceIt.advance(); + return false; + } + Index++; + } + + // We assume that if we saw an event name we also saw a leading addr. + // In other words, LeadingAddr is set if Index is 1 or 2. + if (LeadingIPOnly && Index > 0) { + // Form a profile only from the sample IP. Do not assume an LBR stack + // follows, and ignore it if it does. 
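+    // For illustration: the code below records the IP as a degenerate "half"
+    // LBR entry with Source == Target == SampleIP; computeCounterFromLBR()
+    // later recognizes this shape and records a single-address range rather
+    // than a branch.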
+ uint64_t SampleIP = Binary->canonicalizeVirtualAddress(LeadingAddr); + bool SampleIPIsInternal = Binary->addressIsCode(SampleIP); + if (SampleIPIsInternal) { + // Form a half LBR entry where the sample IP is the destination. + LBRStack.emplace_back(LBREntry(SampleIP, SampleIP)); + } + TraceIt.advance(); + return !LBRStack.empty(); + } + + // Now extract LBR samples - note that we do not reverse the + // LBR entry order so we can unwind the sample stack as we walk + // through LBR entries. + while (Index < Records.size()) { + auto &Token = Records[Index++]; + if (Token.size() == 0) + continue; + + SmallVector Addresses; + Token.split(Addresses, "/"); + uint64_t Src; + uint64_t Dst; + + // Stop at broken LBR records. + if (Addresses.size() < 2 || Addresses[0].substr(2).getAsInteger(16, Src) || + Addresses[1].substr(2).getAsInteger(16, Dst)) { + WarnInvalidLBR(TraceIt); + break; + } + + // Canonicalize to use preferred load address as base address. + Src = Binary->canonicalizeVirtualAddress(Src); + Dst = Binary->canonicalizeVirtualAddress(Dst); + bool SrcIsInternal = Binary->addressIsCode(Src); + bool DstIsInternal = Binary->addressIsCode(Dst); + if (!SrcIsInternal) + Src = ExternalAddr; + if (!DstIsInternal) + Dst = ExternalAddr; + // Filter external-to-external case to reduce LBR trace size. + if (!SrcIsInternal && !DstIsInternal) + continue; + + LBRStack.emplace_back(LBREntry(Src, Dst)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. + + // Extract stack frames from sample + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().starts_with(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + // We might parse a non-perf sample line like empty line and comments, + // skip it + TraceIt.advance(); + return false; + } + TraceIt.advance(); + + FrameAddr = Binary->canonicalizeVirtualAddress(FrameAddr); + // Currently intermixed frame from different binaries is not supported. + if (!Binary->addressIsCode(FrameAddr)) { + if (CallStack.empty()) + NumLeafExternalFrame++; + // Push a special value(ExternalAddr) for the external frames so that + // unwinder can still work on this with artificial Call/Return branch. + // After unwinding, the context will be truncated for external frame. + // Also deduplicate the consecutive external addresses. + if (CallStack.empty() || CallStack.back() != ExternalAddr) + CallStack.emplace_back(ExternalAddr); + continue; + } + + // We need to translate return address to call address for non-leaf frames. + if (!CallStack.empty()) { + auto CallAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + if (!CallAddr) { + // Stop at an invalid return address caused by bad unwinding. This could + // happen to frame-pointer-based unwinding and the callee functions that + // do not have the frame pointer chain set up. + InvalidReturnAddresses.insert(FrameAddr); + break; + } + FrameAddr = CallAddr; + } + + CallStack.emplace_back(FrameAddr); + } + + // Strip out the bottom external addr. 
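+  // e.g. a raw stack of [leaf, EXTERNAL, EXTERNAL, root, EXTERNAL] (leaf to
+  // root) was deduplicated above into [leaf, EXTERNAL, root, EXTERNAL]; the
+  // check below then drops the bottom (root-side) EXTERNAL frame.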
+  if (CallStack.size() > 1 && CallStack.back() == ExternalAddr)
+    CallStack.pop_back();
+
+  // Skip other unrelated lines and find the next valid LBR line.
+  // Note that even for an empty call stack, we should skip the address at the
+  // bottom, otherwise the following pass may generate a truncated call stack.
+  while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().starts_with(" 0x")) {
+    TraceIt.advance();
+  }
+  // Filter out broken stack samples. We may not have complete frame info
+  // if the sample ends up in prolog/epilog; the result is a dangling context
+  // not connected to the entry point. This should be relatively rare and thus
+  // not have much impact on overall profile quality. However, we do want to
+  // filter them out to reduce the number of distinct calling contexts. One
+  // instance of such a case: when a sample lands in prolog/epilog, stack
+  // walking can break in an unexpected way such that higher frames are
+  // missing.
+  return !CallStack.empty() &&
+         !Binary->addressInPrologEpilog(CallStack.front());
+}
+
+void PerfScriptReader::warnIfMissingMMap() {
+  if (!Binary->getMissingMMapWarned() && !Binary->getIsLoadedByMMap()) {
+    WithColor::warning() << "No relevant mmap event is matched for "
+                         << Binary->getName()
+                         << ", will use preferred address ("
+                         << format("0x%" PRIx64,
+                                   Binary->getPreferredBaseAddress())
+                         << ") as the base loading address!\n";
+    // Avoid redundant warnings; only warn at the first unmatched sample.
+    Binary->setMissingMMapWarned(true);
+  }
+}
+
+void HybridPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) {
+  // The raw hybrid sample starts with the call stack in FILO order, followed
+  // immediately by the LBR sample
+  // e.g.
+  //   4005dc    # call stack leaf
+  //   400634
+  //   400684    # call stack root
+  //   0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
+  //   ... 0x4005c8/0x4005dc/P/-/-/0    # LBR Entries
+  //
+  std::shared_ptr<PerfSample> Sample = std::make_shared<PerfSample>();
+#ifndef NDEBUG
+  Sample->Linenum = TraceIt.getLineNumber();
+#endif
+  // Parse the call stack and populate it into PerfSample.CallStack.
+  if (!extractCallstack(TraceIt, Sample->CallStack)) {
+    // Skip the next LBR line matching the current call stack.
+    if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().starts_with(" 0x"))
+      TraceIt.advance();
+    return;
+  }
+
+  warnIfMissingMMap();
+
+  if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().starts_with(" 0x")) {
+    // Parse the LBR stack and populate it into PerfSample.LBRStack.
+    if (extractLBRStack(TraceIt, Sample->LBRStack)) {
+      if (IgnoreStackSamples) {
+        Sample->CallStack.clear();
+      } else {
+        // Canonicalize the stack leaf to avoid a 'random' IP from the leaf
+        // frame skewing LBR ranges.
+        Sample->CallStack.front() = Sample->LBRStack[0].Target;
+      }
+      // Record samples by aggregation.
+      AggregatedSamples[Hashable<PerfSample>(Sample)] += Count;
+    }
+  } else {
+    // The LBR sample is encoded in a single line after the stack sample.
+    exitWithError("Hybrid perf sample is corrupted, no LBR sample line");
+  }
+}
+
+void PerfScriptReader::writeUnsymbolizedProfile(StringRef Filename) {
+  std::error_code EC;
+  raw_fd_ostream OS(Filename, EC, llvm::sys::fs::OF_TextWithCRLF);
+  if (EC)
+    exitWithError(EC, Filename);
+  writeUnsymbolizedProfile(OS);
+}
+
+// Use an ordered map to make the output deterministic.
+using OrderedCounterForPrint = std::map<std::string, SampleCounter *>;
+
+void PerfScriptReader::writeUnsymbolizedProfile(raw_fd_ostream &OS) {
+  OrderedCounterForPrint OrderedCounters;
+  for (auto &CI : SampleCounters) {
+    OrderedCounters[getContextKeyStr(CI.first.getPtr(), Binary)] = &CI.second;
+  }
+
+  auto SCounterPrinter = [&](RangeSample &Counter, StringRef Separator,
+                             uint32_t Indent) {
+    OS.indent(Indent);
+    OS << Counter.size() << "\n";
+    for (auto &I : Counter) {
+      uint64_t Start = I.first.first;
+      uint64_t End = I.first.second;
+
+      if (UseOffset) {
+        if (UseLoadableSegmentAsBase) {
+          Start -= Binary->getFirstLoadableAddress();
+          End -= Binary->getFirstLoadableAddress();
+        } else {
+          Start -= Binary->getPreferredBaseAddress();
+          End -= Binary->getPreferredBaseAddress();
+        }
+      }
+
+      OS.indent(Indent);
+      OS << Twine::utohexstr(Start) << Separator << Twine::utohexstr(End) << ":"
+         << I.second << "\n";
+    }
+  };
+
+  for (auto &CI : OrderedCounters) {
+    uint32_t Indent = 0;
+    if (ProfileIsCS) {
+      // Context string key
+      OS << "[" << CI.first << "]\n";
+      Indent = 2;
+    }
+
+    SampleCounter &Counter = *CI.second;
+    SCounterPrinter(Counter.RangeCounter, "-", Indent);
+    SCounterPrinter(Counter.BranchCounter, "->", Indent);
+  }
+}
+
+// Format of input:
+// number of entries in RangeCounter
+// from_1-to_1:count_1
+// from_2-to_2:count_2
+// ......
+// from_n-to_n:count_n
+// number of entries in BranchCounter
+// src_1->dst_1:count_1
+// src_2->dst_2:count_2
+// ......
+// src_n->dst_n:count_n
+void UnsymbolizedProfileReader::readSampleCounters(TraceStream &TraceIt,
+                                                   SampleCounter &SCounters) {
+  auto exitWithErrorForTraceLine = [](TraceStream &TraceIt) {
+    std::string Msg = TraceIt.isAtEoF()
+                          ? "Invalid raw profile!"
+                          : "Invalid raw profile at line " +
+                                Twine(TraceIt.getLineNumber()).str() + ": " +
+                                TraceIt.getCurrentLine().str();
+    exitWithError(Msg);
+  };
+  auto ReadNumber = [&](uint64_t &Num) {
+    if (TraceIt.isAtEoF())
+      exitWithErrorForTraceLine(TraceIt);
+    if (TraceIt.getCurrentLine().ltrim().getAsInteger(10, Num))
+      exitWithErrorForTraceLine(TraceIt);
+    TraceIt.advance();
+  };
+
+  auto ReadCounter = [&](RangeSample &Counter, StringRef Separator) {
+    uint64_t Num = 0;
+    ReadNumber(Num);
+    while (Num--) {
+      if (TraceIt.isAtEoF())
+        exitWithErrorForTraceLine(TraceIt);
+      StringRef Line = TraceIt.getCurrentLine().ltrim();
+
+      uint64_t Count = 0;
+      auto LineSplit = Line.split(":");
+      if (LineSplit.second.empty() || LineSplit.second.getAsInteger(10, Count))
+        exitWithErrorForTraceLine(TraceIt);
+
+      uint64_t Source = 0;
+      uint64_t Target = 0;
+      auto Range = LineSplit.first.split(Separator);
+      if (Range.second.empty() || Range.first.getAsInteger(16, Source) ||
+          Range.second.getAsInteger(16, Target))
+        exitWithErrorForTraceLine(TraceIt);
+
+      if (UseOffset) {
+        if (UseLoadableSegmentAsBase) {
+          Source += Binary->getFirstLoadableAddress();
+          Target += Binary->getFirstLoadableAddress();
+        } else {
+          Source += Binary->getPreferredBaseAddress();
+          Target += Binary->getPreferredBaseAddress();
+        }
+      }
+
+      Counter[{Source, Target}] += Count;
+      TraceIt.advance();
+    }
+  };
+
+  ReadCounter(SCounters.RangeCounter, "-");
+  ReadCounter(SCounters.BranchCounter, "->");
+}
+
+void UnsymbolizedProfileReader::readUnsymbolizedProfile(StringRef FileName) {
+  TraceStream TraceIt(FileName);
+  while (!TraceIt.isAtEoF()) {
+    std::shared_ptr<StringBasedCtxKey> Key =
+        std::make_shared<StringBasedCtxKey>();
+    StringRef Line = TraceIt.getCurrentLine();
+    // Read the context stack for a CS profile.
+    if (Line.starts_with("[")) {
+      ProfileIsCS = true;
+      auto I = ContextStrSet.insert(Line.str());
+      SampleContext::createCtxVectorFromStr(*I.first, Key->Context);
+      TraceIt.advance();
+    }
+    auto Ret =
+        SampleCounters.emplace(Hashable<ContextKey>(Key), SampleCounter());
+    readSampleCounters(TraceIt, Ret.first->second);
+  }
+}
+
+void UnsymbolizedProfileReader::parsePerfTraces() {
+  readUnsymbolizedProfile(PerfTraceFile);
+}
+
+void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample,
+                                             uint64_t Repeat) {
+  SampleCounter &Counter = SampleCounters.begin()->second;
+  uint64_t EndAddress = 0;
+
+  if (LeadingIPOnly) {
+    assert(Sample->LBRStack.size() == 1 &&
+           "Expected only half LBR entries for ip-only mode");
+    const LBREntry &LBR = *(Sample->LBRStack.begin());
+    uint64_t SourceAddress = LBR.Source;
+    uint64_t TargetAddress = LBR.Target;
+    if (SourceAddress == TargetAddress &&
+        Binary->addressIsCode(TargetAddress)) {
+      Counter.recordRangeCount(SourceAddress, TargetAddress, Repeat);
+    }
+    return;
+  }
+
+  for (const LBREntry &LBR : Sample->LBRStack) {
+    uint64_t SourceAddress = LBR.Source;
+    uint64_t TargetAddress = LBR.Target;
+
+    // Record the branch even if its SourceAddress is external. It can be the
+    // case that an external source calls an internal function; later this
+    // branch will be used to generate the function's head samples.
+    if (Binary->addressIsCode(TargetAddress)) {
+      Counter.recordBranchCount(SourceAddress, TargetAddress, Repeat);
+    }
+
+    // If this is not the first LBR, update the range count between the TO of
+    // the current LBR and the FROM of the next LBR.
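+    // For illustration, given an LBR stack (newest first) of
+    //   B1: 0x30 -> 0x50, B2: 0x10 -> 0x20
+    // the loop records the fall-through range [0x20, 0x30]: execution entered
+    // at B2's target and ran linearly up to B1's source.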
+ uint64_t StartAddress = TargetAddress; + if (Binary->addressIsCode(StartAddress) && + Binary->addressIsCode(EndAddress) && + isValidFallThroughRange(StartAddress, EndAddress, Binary)) + Counter.recordRangeCount(StartAddress, EndAddress, Repeat); + EndAddress = SourceAddress; + } +} + +void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + std::shared_ptr Sample = std::make_shared(); + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + warnIfMissingMMap(); + // Record LBR only samples by aggregation + // If a sampling period is given we can adjust the magnitude of sample + // counts to estimate the absolute magnitute. + if (SamplePeriod.getNumOccurrences()) { + Count *= SamplePeriod; + // If counts are LBR-based, as opposed to IP-based, then the magnitude is + // now amplified by roughly the LBR stack size. By adjusting this down, we + // can produce LBR-based and IP-based profiles with comparable magnitudes. + if (!LeadingIPOnly && Sample->LBRStack.size() > 1) + Count /= (Sample->LBRStack.size() - 1); + } + AggregatedSamples[Hashable(Sample)] += Count; + } +} + +void PerfScriptReader::generateUnsymbolizedProfile() { + // There is no context for LBR only sample, so initialize one entry with + // fake "empty" context key. + assert(SampleCounters.empty() && + "Sample counter map should be empty before raw profile generation"); + std::shared_ptr Key = + std::make_shared(); + SampleCounters.emplace(Hashable(Key), SampleCounter()); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + computeCounterFromLBR(Sample, Item.second); + } +} + +uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) { + // The aggregated count is optional, so do not skip the line and return 1 if + // it's unmatched + uint64_t Count = 1; + if (!TraceIt.getCurrentLine().getAsInteger(10, Count)) + TraceIt.advance(); + return Count; +} + +void PerfScriptReader::parseSample(TraceStream &TraceIt) { + NumTotalSample++; + uint64_t Count = parseAggregatedCount(TraceIt); + assert(Count >= 1 && "Aggregated count should be >= 1!"); + parseSample(TraceIt, Count); +} + +bool PerfScriptReader::extractMMapEventForBinary(ProfiledBinary *Binary, + StringRef Line, + MMapEvent &MMap) { + // Parse a MMap2 line like: + // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 + // 08:04 19532229 3585508847]: r-xp /usr/lib64/libdl-2.17.so + constexpr static const char *const MMap2Pattern = + "PERF_RECORD_MMAP2 (-?[0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)"; + // Parse a MMap line like + // PERF_RECORD_MMAP -1/0: [0xffffffff81e00000(0x3e8fa000) @ \ + // 0xffffffff81e00000]: x [kernel.kallsyms]_text + constexpr static const char *const MMapPattern = + "PERF_RECORD_MMAP (-?[0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0)\\]: [-a-z]+ (.*)"; + // Field 0 - whole line + // Field 1 - PID + // Field 2 - base address + // Field 3 - mmapped size + // Field 4 - page offset + // Field 5 - binary path + enum EventIndex { + WHOLE_LINE = 0, + PID = 1, + MMAPPED_ADDRESS = 2, + MMAPPED_SIZE = 3, + PAGE_OFFSET = 4, + BINARY_PATH = 5 + }; + + bool R = false; + SmallVector Fields; + if (Line.contains("PERF_RECORD_MMAP2 ")) { + Regex RegMmap2(MMap2Pattern); + R = RegMmap2.match(Line, &Fields); + } else if (Line.contains("PERF_RECORD_MMAP ")) { + Regex RegMmap(MMapPattern); + R = RegMmap.match(Line, &Fields); + } else + 
llvm_unreachable("unexpected MMAP event entry"); + + if (!R) { + std::string WarningMsg = "Cannot parse mmap event: " + Line.str() + " \n"; + WithColor::warning() << WarningMsg; + return false; + } + long long MMapPID = 0; + getAsSignedInteger(Fields[PID], 10, MMapPID); + MMap.PID = MMapPID; + Fields[MMAPPED_ADDRESS].getAsInteger(0, MMap.Address); + Fields[MMAPPED_SIZE].getAsInteger(0, MMap.Size); + Fields[PAGE_OFFSET].getAsInteger(0, MMap.Offset); + MMap.BinaryPath = Fields[BINARY_PATH]; + if (ShowMmapEvents) { + outs() << "Mmap: Binary " << MMap.BinaryPath << " loaded at " + << format("0x%" PRIx64 ":", MMap.Address) << " \n"; + } + + StringRef BinaryName = filename(MMap.BinaryPath, Binary->isCOFF()); + if (Binary->isKernel()) { + return Binary->isKernelImageName(BinaryName); + } + return Binary->getName() == BinaryName; +} + +void PerfScriptReader::parseMMapEvent(TraceStream &TraceIt) { + MMapEvent MMap; + if (extractMMapEventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) + updateBinaryAddress(MMap); + TraceIt.advance(); +} + +void PerfScriptReader::parseEventOrSample(TraceStream &TraceIt) { + if (isMMapEvent(TraceIt.getCurrentLine())) + parseMMapEvent(TraceIt); + else + parseSample(TraceIt); +} + +void PerfScriptReader::parseAndAggregateTrace() { + // Trace line iterator + TraceStream TraceIt(PerfTraceFile); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +// A LBR sample is like: +// 40062f 0x5c6313f/0x5c63170/P/-/-/0 0x5c630e7/0x5c63130/P/-/-/0 ... +// A heuristic for fast detection by checking whether a +// leading " 0x" and the '/' exist. +bool PerfScriptReader::isLBRSample(StringRef Line) { + // Skip the leading instruction pointer + SmallVector Records; + Line.trim().split(Records, " ", 2, false); + if (Records.size() < 2) + return false; + // Check if there is an event name before the leading IP. + // If there is, it will be in Records[0]. To skip it, we'll re-split on + // Records[1], which should contain the rest of the line. + if (Records[0].contains(":")) { + // If so, consume the event name and continue processing the rest of the + // line. + StringRef IPAndLBR = Records[1].ltrim(); + Records.clear(); + IPAndLBR.split(Records, " ", 2, false); + if (Records.size() < 2) + return false; + } + if (Records[1].starts_with("0x") && Records[1].contains('/')) + return true; + return false; +} + +bool PerfScriptReader::isMMapEvent(StringRef Line) { + // Short cut to avoid string find is possible. + if (Line.empty() || Line.size() < 50) + return false; + + if (std::isdigit(Line[0])) + return false; + + // PERF_RECORD_MMAP2 or PERF_RECORD_MMAP does not appear at the beginning of + // the line for ` perf script --show-mmap-events -i ...` + return Line.contains("PERF_RECORD_MMAP"); +} + +// The raw hybird sample is like +// e.g. +// 4005dc # call stack leaf +// 400634 +// 400684 # call stack root +// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... +// ... 
0x4005c8/0x4005dc/P/-/-/0    # LBR Entries
+// Determine whether the perfscript contains hybrid samples (call stack +
+// LBRs) by checking whether there is a non-empty call stack immediately
+// followed by an LBR sample.
+PerfContent PerfScriptReader::checkPerfScriptType(StringRef FileName) {
+  TraceStream TraceIt(FileName);
+  uint64_t FrameAddr = 0;
+  while (!TraceIt.isAtEoF()) {
+    // Skip the aggregated count.
+    if (!TraceIt.getCurrentLine().getAsInteger(10, FrameAddr))
+      TraceIt.advance();
+
+    // Detect a sample with a call stack.
+    int32_t Count = 0;
+    while (!TraceIt.isAtEoF() &&
+           !TraceIt.getCurrentLine().ltrim().getAsInteger(16, FrameAddr)) {
+      Count++;
+      TraceIt.advance();
+    }
+    if (!TraceIt.isAtEoF()) {
+      if (isLBRSample(TraceIt.getCurrentLine())) {
+        if (Count > 0)
+          return PerfContent::LBRStack;
+        else
+          return PerfContent::LBR;
+      }
+      TraceIt.advance();
+    }
+  }
+
+  exitWithError("Invalid perf script input!");
+  return PerfContent::UnknownContent;
+}
+
+void HybridPerfReader::generateUnsymbolizedProfile() {
+  ProfileIsCS = !IgnoreStackSamples;
+  if (ProfileIsCS)
+    unwindSamples();
+  else
+    PerfScriptReader::generateUnsymbolizedProfile();
+}
+
+void PerfScriptReader::warnTruncatedStack() {
+  if (ShowDetailedWarning) {
+    for (auto Address : InvalidReturnAddresses) {
+      WithColor::warning()
+          << "Truncated stack sample due to invalid return address at "
+          << format("0x%" PRIx64, Address)
+          << ", likely caused by frame pointer omission\n";
+    }
+  }
+  emitWarningSummary(
+      InvalidReturnAddresses.size(), AggregatedSamples.size(),
+      "of truncated stack samples due to invalid return address, "
+      "likely caused by frame pointer omission.");
+}
+
+void PerfScriptReader::warnInvalidRange() {
+  std::unordered_map<std::pair<uint64_t, uint64_t>, uint64_t,
+                     pair_hash<uint64_t, uint64_t>>
+      Ranges;
+
+  for (const auto &Item : AggregatedSamples) {
+    const PerfSample *Sample = Item.first.getPtr();
+    uint64_t Count = Item.second;
+    uint64_t EndAddress = 0;
+
+    if (LeadingIPOnly) {
+      assert(Sample->LBRStack.size() == 1 &&
+             "Expected only half LBR entries for ip-only mode");
+      const LBREntry &LBR = *(Sample->LBRStack.begin());
+      if (LBR.Source == LBR.Target && LBR.Source != ExternalAddr) {
+        // This is a leading-addr-only profile.
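+        // Each such sample contributes a degenerate [IP, IP] range, so the
+        // validity checks below effectively run on single instructions.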
+        Ranges[{LBR.Source, LBR.Source}] += Count;
+      }
+      continue;
+    }
+
+    for (const LBREntry &LBR : Sample->LBRStack) {
+      uint64_t SourceAddress = LBR.Source;
+      uint64_t StartAddress = LBR.Target;
+      if (EndAddress != 0)
+        Ranges[{StartAddress, EndAddress}] += Count;
+      EndAddress = SourceAddress;
+    }
+  }
+
+  if (Ranges.empty()) {
+    WithColor::warning() << "No samples in perf script!\n";
+    return;
+  }
+
+  auto WarnInvalidRange = [&](uint64_t StartAddress, uint64_t EndAddress,
+                              StringRef Msg) {
+    if (!ShowDetailedWarning)
+      return;
+    WithColor::warning() << "[" << format("%8" PRIx64, StartAddress) << ","
+                         << format("%8" PRIx64, EndAddress) << "]: " << Msg
+                         << "\n";
+  };
+
+  const char *EndNotBoundaryMsg = "Range is not on instruction boundary, "
+                                  "likely due to profile and binary mismatch.";
+  const char *DanglingRangeMsg = "Range does not belong to any functions, "
+                                 "likely from PLT, .init or .fini section.";
+  const char *RangeCrossFuncMsg =
+      "Fall through range should not cross function boundaries, likely due to "
+      "profile and binary mismatch.";
+  const char *BogusRangeMsg = "Range start is after or too far from range end.";
+
+  uint64_t TotalRangeNum = 0;
+  uint64_t InstNotBoundary = 0;
+  uint64_t UnmatchedRange = 0;
+  uint64_t RangeCrossFunc = 0;
+  uint64_t BogusRange = 0;
+
+  for (auto &I : Ranges) {
+    uint64_t StartAddress = I.first.first;
+    uint64_t EndAddress = I.first.second;
+    TotalRangeNum += I.second;
+
+    if (!Binary->addressIsCode(StartAddress) &&
+        !Binary->addressIsCode(EndAddress))
+      continue;
+
+    // IP samples can indicate activity on individual instructions rather than
+    // basic blocks/edges. In this mode, don't warn if sampled IPs aren't
+    // branches.
+    if (!LeadingIPOnly)
+      if (!Binary->addressIsCode(StartAddress) ||
+          !Binary->addressIsTransfer(EndAddress)) {
+        InstNotBoundary += I.second;
+        WarnInvalidRange(StartAddress, EndAddress, EndNotBoundaryMsg);
+      }
+
+    auto *FRange = Binary->findFuncRange(StartAddress);
+    if (!FRange) {
+      UnmatchedRange += I.second;
+      WarnInvalidRange(StartAddress, EndAddress, DanglingRangeMsg);
+      continue;
+    }
+
+    if (EndAddress >= FRange->EndAddress) {
+      RangeCrossFunc += I.second;
+      WarnInvalidRange(StartAddress, EndAddress, RangeCrossFuncMsg);
+    }
+
+    if (Binary->addressIsCode(StartAddress) &&
+        Binary->addressIsCode(EndAddress) &&
+        !isValidFallThroughRange(StartAddress, EndAddress, Binary)) {
+      BogusRange += I.second;
+      WarnInvalidRange(StartAddress, EndAddress, BogusRangeMsg);
+    }
+  }
+
+  emitWarningSummary(
+      InstNotBoundary, TotalRangeNum,
+      "of samples are from ranges that are not on instruction boundaries.");
+  emitWarningSummary(
+      UnmatchedRange, TotalRangeNum,
+      "of samples are from ranges that do not belong to any functions.");
+  emitWarningSummary(
+      RangeCrossFunc, TotalRangeNum,
+      "of samples are from ranges that cross function boundaries.");
+  emitWarningSummary(
+      BogusRange, TotalRangeNum,
+      "of samples are from ranges that have the range start after, or too far "
+      "from, the range end crossing an unconditional jmp.");
+}
+
+void PerfScriptReader::parsePerfTraces() {
+  // Parse perf traces and do aggregation.
+ parseAndAggregateTrace(); + if (Binary->isKernel() && !Binary->getIsLoadedByMMap()) { + exitWithError( + "Kernel is requested, but no kernel is found in mmap events."); + } + + emitWarningSummary(NumLeafExternalFrame, NumTotalSample, + "of samples have leaf external frame in call stack."); + emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample, + "of samples have leading external LBR."); + + // Generate unsymbolized profile. + warnTruncatedStack(); + warnInvalidRange(); + generateUnsymbolizedProfile(); + AggregatedSamples.clear(); + + if (SkipSymbolization) + writeUnsymbolizedProfile(OutputFilename); +} + +SmallVector PerfScriptReader::TempFileCleanups; + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-19.1/PerfReader.h b/tools/ldc-profgen/ldc-profgen-19.1/PerfReader.h new file mode 100644 index 00000000000..a3bd7a0a649 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/PerfReader.h @@ -0,0 +1,748 @@ +//===-- PerfReader.h - perfscript reader -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#include "ErrorHandling.h" +#include "ProfiledBinary.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" +#include +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { + +class CleanupInstaller; + +namespace sampleprof { + +// Stream based trace line iterator +class TraceStream { + std::string CurrentLine; + std::ifstream Fin; + bool IsAtEoF = false; + uint64_t LineNumber = 0; + +public: + TraceStream(StringRef Filename) : Fin(Filename.str()) { + if (!Fin.good()) + exitWithError("Error read input perf script file", Filename); + advance(); + } + + StringRef getCurrentLine() { + assert(!IsAtEoF && "Line iterator reaches the End-of-File!"); + return CurrentLine; + } + + uint64_t getLineNumber() { return LineNumber; } + + bool isAtEoF() { return IsAtEoF; } + + // Read the next line + void advance() { + if (!std::getline(Fin, CurrentLine)) { + IsAtEoF = true; + return; + } + LineNumber++; + } +}; + +// The type of input format. +enum PerfFormat { + UnknownFormat = 0, + PerfData = 1, // Raw linux perf.data. + PerfScript = 2, // Perf script create by `perf script` command. + UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen. + +}; + +// The type of perfscript content. +enum PerfContent { + UnknownContent = 0, + LBR = 1, // Only LBR sample. + LBRStack = 2, // Hybrid sample including call stack and LBR stack. +}; + +struct PerfInputFile { + std::string InputFile; + PerfFormat Format = PerfFormat::UnknownFormat; + PerfContent Content = PerfContent::UnknownContent; +}; + +// The parsed LBR sample entry. 
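+// For illustration, the raw token "0x4005c8/0x4005dc/P/-/-/0" from a perf
+// script brstack field parses to Source == 0x4005c8 and Target == 0x4005dc;
+// the remaining flag fields are not used by this tool.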
+struct LBREntry {
+  uint64_t Source = 0;
+  uint64_t Target = 0;
+  LBREntry(uint64_t S, uint64_t T) : Source(S), Target(T) {}
+
+#ifndef NDEBUG
+  void print() const {
+    dbgs() << "from " << format("%#010x", Source) << " to "
+           << format("%#010x", Target);
+  }
+#endif
+};
+
+#ifndef NDEBUG
+static inline void printLBRStack(const SmallVectorImpl<LBREntry> &LBRStack) {
+  for (size_t I = 0; I < LBRStack.size(); I++) {
+    dbgs() << "[" << I << "] ";
+    LBRStack[I].print();
+    dbgs() << "\n";
+  }
+}
+
+static inline void printCallStack(const SmallVectorImpl<uint64_t> &CallStack) {
+  for (size_t I = 0; I < CallStack.size(); I++) {
+    dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n";
+  }
+}
+#endif
+
+// Hash interface for generic data of type T.
+// Data should implement \fn getHashCode and \fn isEqual. Currently getHashCode
+// is non-virtual to avoid the overhead of a vtable call, i.e. we explicitly
+// calculate the hash of the derived class and assign it to the base class's
+// HashCode. This also provides the flexibility to calculate the hash code
+// incrementally (like a rolling hash) during frame stack unwinding, since
+// unwinding only changes the leaf of the frame stack. \fn isEqual is a virtual
+// function, which has some perf overhead. In the future, if we design a better
+// hash function, we can skip the comparison or switch to a non-virtual
+// function (e.g. just ignore the comparison if the probability of hash
+// conflicts is low).
+template <typename T> class Hashable {
+public:
+  std::shared_ptr<T> Data;
+  Hashable(const std::shared_ptr<T> &D) : Data(D) {}
+
+  // Hash code generation
+  struct Hash {
+    uint64_t operator()(const Hashable<T> &Key) const {
+      // Don't make getHashCode virtual.
+      uint64_t Hash = Key.Data->getHashCode();
+      assert(Hash && "Should generate HashCode for it!");
+      return Hash;
+    }
+  };
+
+  // Hash equality
+  struct Equal {
+    bool operator()(const Hashable<T> &LHS, const Hashable<T> &RHS) const {
+      // Compare the data precisely; the vtable call has some overhead.
+      return LHS.Data->isEqual(RHS.Data.get());
+    }
+  };
+
+  T *getPtr() const { return Data.get(); }
+};
+
+struct PerfSample {
+  // LBR stack recorded in FIFO order.
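+  // (i.e. the most recent branch comes first, as printed by perf script; the
+  // unwinder deliberately consumes entries in this order so that it walks
+  // backwards in time.)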
+ SmallVector LBRStack; + // Call stack recorded in FILO(leaf to root) order, it's used for CS-profile + // generation + SmallVector CallStack; + + virtual ~PerfSample() = default; + uint64_t getHashCode() const { + // Use simple DJB2 hash + auto HashCombine = [](uint64_t H, uint64_t V) { + return ((H << 5) + H) + V; + }; + uint64_t Hash = 5381; + for (const auto &Value : CallStack) { + Hash = HashCombine(Hash, Value); + } + for (const auto &Entry : LBRStack) { + Hash = HashCombine(Hash, Entry.Source); + Hash = HashCombine(Hash, Entry.Target); + } + return Hash; + } + + bool isEqual(const PerfSample *Other) const { + const SmallVector &OtherCallStack = Other->CallStack; + const SmallVector &OtherLBRStack = Other->LBRStack; + + if (CallStack.size() != OtherCallStack.size() || + LBRStack.size() != OtherLBRStack.size()) + return false; + + if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin())) + return false; + + for (size_t I = 0; I < OtherLBRStack.size(); I++) { + if (LBRStack[I].Source != OtherLBRStack[I].Source || + LBRStack[I].Target != OtherLBRStack[I].Target) + return false; + } + return true; + } + +#ifndef NDEBUG + uint64_t Linenum = 0; + + void print() const { + dbgs() << "Line " << Linenum << "\n"; + dbgs() << "LBR stack\n"; + printLBRStack(LBRStack); + dbgs() << "Call stack\n"; + printCallStack(CallStack); + } +#endif +}; +// After parsing the sample, we record the samples by aggregating them +// into this counter. The key stores the sample data and the value is +// the sample repeat times. +using AggregatedCounter = + std::unordered_map, uint64_t, + Hashable::Hash, Hashable::Equal>; + +using SampleVector = SmallVector, 16>; + +inline bool isValidFallThroughRange(uint64_t Start, uint64_t End, + ProfiledBinary *Binary) { + // Start bigger than End is considered invalid. + // LBR ranges cross the unconditional jmp are also assumed invalid. + // It's found that perf data may contain duplicate LBR entries that could form + // a range that does not reflect real execution flow on some Intel targets, + // e.g. Skylake. Such ranges are ususally very long. Exclude them since there + // cannot be a linear execution range that spans over unconditional jmp. 
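+  // For illustration: a range like [0x4005b0, 0x4005c8] is accepted only if
+  // Start <= End and no unconditional branch lies between the two addresses,
+  // which rangeCrossUncondBranch() is expected to check.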
+  return Start <= End && !Binary->rangeCrossUncondBranch(Start, End);
+}
+
+// The state for the unwinder. It doesn't hold the data but only keeps
+// pointers/indices into the data. While unwinding, the CallStack is changed
+// dynamically and will be recorded as the context of the sample.
+struct UnwindState {
+  // Profiled binary that the current frame address belongs to.
+  const ProfiledBinary *Binary;
+  // Call stack trie node
+  struct ProfiledFrame {
+    const uint64_t Address = DummyRoot;
+    ProfiledFrame *Parent;
+    SampleVector RangeSamples;
+    SampleVector BranchSamples;
+    std::unordered_map<uint64_t, std::unique_ptr<ProfiledFrame>> Children;
+
+    ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr)
+        : Address(Addr), Parent(P) {}
+    ProfiledFrame *getOrCreateChildFrame(uint64_t Address) {
+      assert(Address && "Address can't be zero!");
+      auto Ret = Children.emplace(
+          Address, std::make_unique<ProfiledFrame>(Address, this));
+      return Ret.first->second.get();
+    }
+    void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) {
+      RangeSamples.emplace_back(std::make_tuple(Start, End, Count));
+    }
+    void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) {
+      BranchSamples.emplace_back(std::make_tuple(Source, Target, Count));
+    }
+    bool isDummyRoot() { return Address == DummyRoot; }
+    bool isExternalFrame() { return Address == ExternalAddr; }
+    bool isLeafFrame() { return Children.empty(); }
+  };
+
+  ProfiledFrame DummyTrieRoot;
+  ProfiledFrame *CurrentLeafFrame;
+  // Used to fall through the LBR stack.
+  uint32_t LBRIndex = 0;
+  // Reference to PerfSample.LBRStack.
+  const SmallVector<LBREntry, 16> &LBRStack;
+  // Used to iterate the address range.
+  InstructionPointer InstPtr;
+  // Indicates whether unwinding is currently in a bad state that requires
+  // skipping all subsequent unwinding.
+  bool Invalid = false;
+  UnwindState(const PerfSample *Sample, const ProfiledBinary *Binary)
+      : Binary(Binary), LBRStack(Sample->LBRStack),
+        InstPtr(Binary, Sample->CallStack.front()) {
+    initFrameTrie(Sample->CallStack);
+  }
+
+  bool validateInitialState() {
+    uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
+    uint64_t LeafAddr = CurrentLeafFrame->Address;
+    assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) &&
+           "External leading LBR should match the leaf frame.");
+
+    // When we take a stack sample, ideally the sampling distance between the
+    // leaf IP of the stack and the last LBR target shouldn't be very large.
+    // Use a heuristic size (0x100) to filter out broken records.
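+    // e.g. a stack leaf at 0x401230 with an LBR tip of 0x401200 (distance
+    // 0x30) passes, while a leaf below the LBR tip, or 0x100+ bytes past it,
+    // is rejected as a bogus trace below.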
+ if (LeafAddr < LBRLeaf || LeafAddr - LBRLeaf >= 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", LeafAddr) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CurrentLeafFrame->Address && + "IP should align with context leaf"); + } + + void setInvalid() { Invalid = true; } + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + bool IsLastLBR() const { return LBRIndex == 0; } + bool getLBRStackSize() const { return LBRStack.size(); } + void advanceLBR() { LBRIndex++; } + ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; } + + void pushFrame(uint64_t Address) { + CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address); + } + + void switchToFrame(uint64_t Address) { + if (CurrentLeafFrame->Address == Address) + return; + CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address); + } + + void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; } + + void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; } + + void initFrameTrie(const SmallVectorImpl &CallStack) { + ProfiledFrame *Cur = &DummyTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateChildFrame(Address); + } + CurrentLeafFrame = Cur; + } + + ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; } +}; + +// Base class for sample counter key with context +struct ContextKey { + uint64_t HashCode = 0; + virtual ~ContextKey() = default; + uint64_t getHashCode() { + if (HashCode == 0) + genHashCode(); + return HashCode; + } + virtual void genHashCode() = 0; + virtual bool isEqual(const ContextKey *K) const { + return HashCode == K->HashCode; + }; + + // Utilities for LLVM-style RTTI + enum ContextKind { CK_StringBased, CK_AddrBased }; + const ContextKind Kind; + ContextKind getKind() const { return Kind; } + ContextKey(ContextKind K) : Kind(K){}; +}; + +// String based context id +struct StringBasedCtxKey : public ContextKey { + SampleContextFrameVector Context; + + bool WasLeafInlined; + StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_StringBased; + } + + bool isEqual(const ContextKey *K) const override { + const StringBasedCtxKey *Other = dyn_cast(K); + return Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_value(SampleContextFrames(Context)); + } +}; + +// Address-based context id +struct AddrBasedCtxKey : public ContextKey { + SmallVector Context; + + bool WasLeafInlined; + AddrBasedCtxKey() : ContextKey(CK_AddrBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_AddrBased; + } + + bool isEqual(const ContextKey *K) const override { + const AddrBasedCtxKey *Other = dyn_cast(K); + return Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_combine_range(Context.begin(), Context.end()); + } +}; + +// The counter of branch samples for one function indexed by the branch, +// which is represented as the source and target offset pair. 
+using BranchSample = std::map, uint64_t>; +// The counter of range samples for one function indexed by the range, +// which is represented as the start and end offset pair. +using RangeSample = std::map, uint64_t>; +// Wrapper for sample counters including range counter and branch counter +struct SampleCounter { + RangeSample RangeCounter; + BranchSample BranchCounter; + + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) { + assert(Start <= End && "Invalid instruction range"); + RangeCounter[{Start, End}] += Repeat; + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) { + BranchCounter[{Source, Target}] += Repeat; + } +}; + +// Sample counter with context to support context-sensitive profile +using ContextSampleCounterMap = + std::unordered_map, SampleCounter, + Hashable::Hash, Hashable::Equal>; + +struct FrameStack { + SmallVector Stack; + ProfiledBinary *Binary; + FrameStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +struct AddressStack { + SmallVector Stack; + ProfiledBinary *Binary; + AddressStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +/* +As in hybrid sample we have a group of LBRs and the most recent sampling call +stack, we can walk through those LBRs to infer more call stacks which would be +used as context for profile. VirtualUnwinder is the class to do the call stack +unwinding based on LBR state. Two types of unwinding are processd here: +1) LBR unwinding and 2) linear range unwinding. +Specifically, for each LBR entry(can be classified into call, return, regular +branch), LBR unwinding will replay the operation by pushing, popping or +switching leaf frame towards the call stack and since the initial call stack +is most recently sampled, the replay should be in anti-execution order, i.e. for +the regular case, pop the call stack when LBR is call, push frame on call stack +when LBR is return. After each LBR processed, it also needs to align with the +next LBR by going through instructions from previous LBR's target to current +LBR's source, which is the linear unwinding. As instruction from linear range +can come from different function by inlining, linear unwinding will do the range +splitting and record counters by the range with same inline context. Over those +unwinding process we will record each call stack as context id and LBR/linear +range as sample counter for further CS profile generation. 
+*/ +class VirtualUnwinder { +public: + VirtualUnwinder(ContextSampleCounterMap *Counter, ProfiledBinary *B) + : CtxCounterMap(Counter), Binary(B) {} + bool unwind(const PerfSample *Sample, uint64_t Repeat); + std::set &getUntrackedCallsites() { return UntrackedCallsites; } + + uint64_t NumTotalBranches = 0; + uint64_t NumExtCallBranch = 0; + uint64_t NumMissingExternalFrame = 0; + uint64_t NumMismatchedProEpiBranch = 0; + uint64_t NumMismatchedExtCallBranch = 0; + uint64_t NumUnpairedExtAddr = 0; + uint64_t NumPairedExtAddr = 0; + +private: + bool isSourceExternal(UnwindState &State) const { + return State.getCurrentLBRSource() == ExternalAddr; + } + + bool isTargetExternal(UnwindState &State) const { + return State.getCurrentLBRTarget() == ExternalAddr; + } + + // Determine whether the return source is from external code by checking if + // the target's the next inst is a call inst. + bool isReturnFromExternal(UnwindState &State) const { + return isSourceExternal(State) && + (Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) != 0); + } + + // If the source is external address but it's not the `return` case, treat it + // as a call from external. + bool isCallFromExternal(UnwindState &State) const { + return isSourceExternal(State) && + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) == 0; + } + + bool isCallState(UnwindState &State) const { + // The tail call frame is always missing here in stack sample, we will + // use a specific tail call tracker to infer it. + if (!isValidState(State)) + return false; + + if (Binary->addressIsCall(State.getCurrentLBRSource())) + return true; + + return isCallFromExternal(State); + } + + bool isReturnState(UnwindState &State) const { + if (!isValidState(State)) + return false; + + // Simply check addressIsReturn, as ret is always reliable, both for + // regular call and tail call. + if (Binary->addressIsReturn(State.getCurrentLBRSource())) + return true; + + return isReturnFromExternal(State); + } + + bool isValidState(UnwindState &State) const { return !State.Invalid; } + + void unwindCall(UnwindState &State); + void unwindLinear(UnwindState &State, uint64_t Repeat); + void unwindReturn(UnwindState &State); + void unwindBranch(UnwindState &State); + + template + void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack); + // Collect each samples on trie node by DFS traversal + template + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack); + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur); + + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, + uint64_t Repeat); + void recordBranchCount(const LBREntry &Branch, UnwindState &State, + uint64_t Repeat); + + ContextSampleCounterMap *CtxCounterMap; + // Profiled binary that current frame address belongs to + ProfiledBinary *Binary; + // Keep track of all untracked callsites + std::set UntrackedCallsites; +}; + +// Read perf trace to parse the events and samples. +class PerfReaderBase { +public: + PerfReaderBase(ProfiledBinary *B, StringRef PerfTrace) + : Binary(B), PerfTraceFile(PerfTrace) { + // Initialize the base address to preferred address. 
+ Binary->setBaseAddress(Binary->getPreferredBaseAddress()); + }; + virtual ~PerfReaderBase() = default; + static std::unique_ptr + create(ProfiledBinary *Binary, PerfInputFile &PerfInput, + std::optional PIDFilter); + + // Entry of the reader to parse multiple perf traces + virtual void parsePerfTraces() = 0; + const ContextSampleCounterMap &getSampleCounters() const { + return SampleCounters; + } + bool profileIsCS() { return ProfileIsCS; } + +protected: + ProfiledBinary *Binary = nullptr; + StringRef PerfTraceFile; + + ContextSampleCounterMap SampleCounters; + bool ProfileIsCS = false; + + uint64_t NumTotalSample = 0; + uint64_t NumLeafExternalFrame = 0; + uint64_t NumLeadingOutgoingLBR = 0; +}; + +// Read perf script to parse the events and samples. +class PerfScriptReader : public PerfReaderBase { +public: + PerfScriptReader(ProfiledBinary *B, StringRef PerfTrace, + std::optional PID) + : PerfReaderBase(B, PerfTrace), PIDFilter(PID) {}; + + // Entry of the reader to parse multiple perf traces + void parsePerfTraces() override; + // Generate perf script from perf data + static PerfInputFile convertPerfDataToTrace(ProfiledBinary *Binary, + bool SkipPID, PerfInputFile &File, + std::optional PIDFilter); + // Extract perf script type by peaking at the input + static PerfContent checkPerfScriptType(StringRef FileName); + + // Cleanup installers for temporary files created by perf script command. + // Those files will be automatically removed when running destructor or + // receiving signals. + static SmallVector TempFileCleanups; + +protected: + // The parsed MMap event + struct MMapEvent { + int64_t PID = 0; + uint64_t Address = 0; + uint64_t Size = 0; + uint64_t Offset = 0; + StringRef BinaryPath; + }; + + // Check whether a given line is LBR sample + static bool isLBRSample(StringRef Line); + // Check whether a given line is MMAP event + static bool isMMapEvent(StringRef Line); + // Parse a single line of a PERF_RECORD_MMAP event looking for a + // mapping between the binary name and its memory layout. + static bool extractMMapEventForBinary(ProfiledBinary *Binary, StringRef Line, + MMapEvent &MMap); + // Update base address based on mmap events + void updateBinaryAddress(const MMapEvent &Event); + // Parse mmap event and update binary address + void parseMMapEvent(TraceStream &TraceIt); + // Parse perf events/samples and do aggregation + void parseAndAggregateTrace(); + // Parse either an MMAP event or a perf sample + void parseEventOrSample(TraceStream &TraceIt); + // Warn if the relevant mmap event is missing. + void warnIfMissingMMap(); + // Emit accumulate warnings. + void warnTruncatedStack(); + // Warn if range is invalid. + void warnInvalidRange(); + // Extract call stack from the perf trace lines + bool extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack); + // Extract LBR stack from one perf trace line + bool extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack); + uint64_t parseAggregatedCount(TraceStream &TraceIt); + // Parse one sample from multiple perf lines, override this for different + // sample type + void parseSample(TraceStream &TraceIt); + // An aggregated count is given to indicate how many times the sample is + // repeated. + virtual void parseSample(TraceStream &TraceIt, uint64_t Count){}; + void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat); + // Post process the profile after trace aggregation, we will do simple range + // overlap computation for AutoFDO, or unwind for CSSPGO(hybrid sample). 
+ virtual void generateUnsymbolizedProfile(); + void writeUnsymbolizedProfile(StringRef Filename); + void writeUnsymbolizedProfile(raw_fd_ostream &OS); + + // Samples with the repeating time generated by the perf reader + AggregatedCounter AggregatedSamples; + // Keep track of all invalid return addresses + std::set InvalidReturnAddresses; + // PID for the process of interest + std::optional PIDFilter; +}; + +/* + The reader of LBR only perf script. + A typical LBR sample is like: + 40062f 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 +*/ +class LBRPerfReader : public PerfScriptReader { +public: + LBRPerfReader(ProfiledBinary *Binary, StringRef PerfTrace, + std::optional PID) + : PerfScriptReader(Binary, PerfTrace, PID) {}; + // Parse the LBR only sample. + void parseSample(TraceStream &TraceIt, uint64_t Count) override; +}; + +/* + Hybrid perf script includes a group of hybrid samples(LBRs + call stack), + which is used to generate CS profile. An example of hybrid sample: + 4005dc # call stack leaf + 400634 + 400684 # call stack root + 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +*/ +class HybridPerfReader : public PerfScriptReader { +public: + HybridPerfReader(ProfiledBinary *Binary, StringRef PerfTrace, + std::optional PID) + : PerfScriptReader(Binary, PerfTrace, PID) {}; + // Parse the hybrid sample including the call and LBR line + void parseSample(TraceStream &TraceIt, uint64_t Count) override; + void generateUnsymbolizedProfile() override; + +private: + // Unwind the hybrid samples after aggregration + void unwindSamples(); +}; + +/* + Format of unsymbolized profile: + + [frame1 @ frame2 @ ...] # If it's a CS profile + number of entries in RangeCounter + from_1-to_1:count_1 + from_2-to_2:count_2 + ...... + from_n-to_n:count_n + number of entries in BranchCounter + src_1->dst_1:count_1 + src_2->dst_2:count_2 + ...... + src_n->dst_n:count_n + [frame1 @ frame2 @ ...] # Next context + ...... + +Note that non-CS profile doesn't have the empty `[]` context. +*/ +class UnsymbolizedProfileReader : public PerfReaderBase { +public: + UnsymbolizedProfileReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfReaderBase(Binary, PerfTrace){}; + void parsePerfTraces() override; + +private: + void readSampleCounters(TraceStream &TraceIt, SampleCounter &SCounters); + void readUnsymbolizedProfile(StringRef Filename); + + std::unordered_set ContextStrSet; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-19.1/ProfileGenerator.cpp b/tools/ldc-profgen/ldc-profgen-19.1/ProfileGenerator.cpp new file mode 100644 index 00000000000..175556c2220 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/ProfileGenerator.cpp @@ -0,0 +1,1384 @@ +//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "ProfileGenerator.h" +#include "ErrorHandling.h" +#include "MissingFrameInferrer.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include +#include +#include +#include + +cl::opt OutputFilename("output", cl::value_desc("output"), + cl::Required, + cl::desc("Output profile file")); +static cl::alias OutputA("o", cl::desc("Alias for --output"), + cl::aliasopt(OutputFilename)); + +static cl::opt OutputFormat( + "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), + cl::values( + clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"), + clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), + clEnumValN(SPF_Text, "text", "Text encoding"), + clEnumValN(SPF_GCC, "gcc", + "GCC encoding (only meaningful for -sample)"))); + +static cl::opt UseMD5( + "use-md5", cl::Hidden, + cl::desc("Use md5 to represent function names in the output profile (only " + "meaningful for -extbinary)")); + +static cl::opt PopulateProfileSymbolList( + "populate-profile-symbol-list", cl::init(false), cl::Hidden, + cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); + +static cl::opt FillZeroForAllFuncs( + "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, + cl::desc("Attribute all functions' range with zero count " + "even it's not hit by any samples.")); + +static cl::opt RecursionCompression( + "compress-recursion", + cl::desc("Compressing recursion by deduplicating adjacent frame " + "sequences up to the specified size. -1 means no size limit."), + cl::Hidden, + cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); + +static cl::opt + TrimColdProfile("trim-cold-profile", + cl::desc("If the total count of the profile is smaller " + "than threshold, it will be trimmed.")); + +static cl::opt CSProfMergeColdContext( + "csprof-merge-cold-context", cl::init(true), + cl::desc("If the total count of context profile is smaller than " + "the threshold, it will be merged into context-less base " + "profile.")); + +static cl::opt CSProfMaxColdContextDepth( + "csprof-max-cold-context-depth", cl::init(1), + cl::desc("Keep the last K contexts while merging cold profile. 1 means the " + "context-less base profile")); + +static cl::opt CSProfMaxContextDepth( + "csprof-max-context-depth", + cl::desc("Keep the last K contexts while merging profile. 
-1 means no " + "depth limit."), + cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); + +static cl::opt ProfileDensityThreshold( + "profile-density-threshold", llvm::cl::init(50), + llvm::cl::desc("If the profile density is below the given threshold, it " + "will be suggested to increase the sampling rate."), + llvm::cl::Optional); +static cl::opt ShowDensity("show-density", llvm::cl::init(false), + llvm::cl::desc("show profile density details"), + llvm::cl::Optional); +static cl::opt ProfileDensityCutOffHot( + "profile-density-cutoff-hot", llvm::cl::init(990000), + llvm::cl::desc("Total samples cutoff for functions used to calculate " + "profile density.")); + +static cl::opt UpdateTotalSamples( + "update-total-samples", llvm::cl::init(false), + llvm::cl::desc( + "Update total samples by accumulating all its body samples."), + llvm::cl::Optional); + +static cl::opt GenCSNestedProfile( + "gen-cs-nested-profile", cl::Hidden, cl::init(true), + cl::desc("Generate nested function profiles for CSSPGO")); + +cl::opt InferMissingFrames( + "infer-missing-frames", llvm::cl::init(true), + llvm::cl::desc( + "Infer missing call frames due to compiler tail call elimination."), + llvm::cl::Optional); + +extern cl::opt LeadingIPOnly; + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +extern cl::opt ProfileSummaryCutoffHot; +extern cl::opt UseContextLessSummary; + +namespace sampleprof { + +// Initialize the MaxCompressionSize to -1 which means no size limit +int32_t CSProfileGenerator::MaxCompressionSize = -1; + +int CSProfileGenerator::MaxContextDepth = -1; + +bool ProfileGeneratorBase::UseFSDiscriminator = false; + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, + const ContextSampleCounterMap *SampleCounters, + bool ProfileIsCS) { + std::unique_ptr Generator; + if (ProfileIsCS) { + Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); + } else { + Generator.reset(new ProfileGenerator(Binary, SampleCounters)); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles, + bool ProfileIsCS) { + std::unique_ptr Generator; + if (ProfileIsCS) { + Generator.reset(new CSProfileGenerator(Binary, Profiles)); + } else { + Generator.reset(new ProfileGenerator(Binary, std::move(Profiles))); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +void ProfileGeneratorBase::write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap) { + // Populate profile symbol list if extended binary format is used. + ProfileSymbolList SymbolList; + + if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { + Binary->populateSymbolListFromDWARF(SymbolList); + Writer->setProfileSymbolList(&SymbolList); + } + + if (std::error_code EC = Writer->write(ProfileMap)) + exitWithError(std::move(EC)); +} + +void ProfileGeneratorBase::write() { + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); + if (std::error_code EC = WriterOrErr.getError()) + exitWithError(EC, OutputFilename); + + if (UseMD5) { + if (OutputFormat != SPF_Ext_Binary) + WithColor::warning() << "-use-md5 is ignored. 
Specify " + "--format=extbinary to enable it\n"; + else + WriterOrErr.get()->setUseMD5(); + } + + write(std::move(WriterOrErr.get()), ProfileMap); +} + +void ProfileGeneratorBase::showDensitySuggestion(double Density) { + if (Density == 0.0) + WithColor::warning() << "The output profile is empty or the " + "--profile-density-cutoff-hot option is " + "set too low. Please check your command.\n"; + else if (Density < ProfileDensityThreshold) + WithColor::warning() + << "Sample PGO is estimated to optimize better with " + << format("%.1f", ProfileDensityThreshold / Density) + << "x more samples. Please consider increasing sampling rate or " + "profiling for longer duration to get more samples.\n"; + + if (ShowDensity) + outs() << "Functions with density >= " << format("%.1f", Density) + << " account for " + << format("%.2f", + static_cast(ProfileDensityCutOffHot) / 10000) + << "% total sample counts.\n"; +} + +bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) { + for (const auto &Prefix : FuncPrefixsToFilter) { + if (FS.getFuncName().starts_with(Prefix)) + return true; + } + + // Filter the function profiles for the inlinees. It's useful for fuzzy + // profile matching which flattens the profile and inlinees' samples are + // merged into top-level function. + for (auto &Callees : + const_cast(FS.getCallsiteSamples())) { + auto &CalleesMap = Callees.second; + for (auto I = CalleesMap.begin(); I != CalleesMap.end();) { + auto FS = I++; + if (filterAmbiguousProfile(FS->second)) + CalleesMap.erase(FS); + } + } + return false; +} + +// For built-in local initialization function such as __cxx_global_var_init, +// __tls_init prefix function, there could be multiple versions of the functions +// in the final binary. However, in the profile generation, we call +// getCanonicalFnName to canonicalize the names which strips the suffixes. +// Therefore, samples from different functions queries the same profile and the +// samples are merged. As the functions are essentially different, entries of +// the merged profile are ambiguous. In sample loader, the IR from one version +// would be attributed towards a merged entries, which is inaccurate. Especially +// for fuzzy profile matching, it gets multiple callsites(from different +// function) but used to match one callsite, which misleads the matching and +// causes a lot of false positives report. Hence, we want to filter them out +// from the profile map during the profile generation time. The profiles are all +// cold functions, it won't have perf impact. +void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) { + for (auto I = ProfileMap.begin(); I != ProfileMap.end();) { + auto FS = I++; + if (filterAmbiguousProfile(FS->second)) + ProfileMap.erase(FS); + } +} + +void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges) { + + /* + Regions may overlap with each other. Using the boundary info, find all + disjoint ranges and their sample count. BoundaryPoint contains the count + multiple samples begin/end at this points. + + |<--100-->| Sample1 + |<------200------>| Sample2 + A B C + + In the example above, + Sample1 begins at A, ends at B, its value is 100. + Sample2 beings at A, ends at C, its value is 200. + For A, BeginCount is the sum of sample begins at A, which is 300 and no + samples ends at A, so EndCount is 0. 
+  Then boundary points A, B, and C with begin/end counts are:
+  A: (300, 0)
+  B: (0, 100)
+  C: (0, 200)
+  */
+  struct BoundaryPoint {
+    // Sum of sample counts beginning at this point
+    uint64_t BeginCount = UINT64_MAX;
+    // Sum of sample counts ending at this point
+    uint64_t EndCount = UINT64_MAX;
+    // Is the begin point of a zero range.
+    bool IsZeroRangeBegin = false;
+    // Is the end point of a zero range.
+    bool IsZeroRangeEnd = false;
+
+    void addBeginCount(uint64_t Count) {
+      if (BeginCount == UINT64_MAX)
+        BeginCount = 0;
+      BeginCount += Count;
+    }
+
+    void addEndCount(uint64_t Count) {
+      if (EndCount == UINT64_MAX)
+        EndCount = 0;
+      EndCount += Count;
+    }
+  };
+
+  /*
+  For the above example, with the boundary points, the following logic finds
+  two disjoint regions:
+
+  [A,B]:   300
+  [B+1,C]: 200
+
+  If there is a boundary point that both begins and ends, the point itself
+  becomes a separate disjoint region. For example, if we have original
+  ranges of
+
+  |<--- 100 --->|
+          |<--- 200 --->|
+  A       B             C
+
+  there are three boundary points with their begin/end counts of
+
+  A: (100, 0)
+  B: (200, 100)
+  C: (0, 200)
+
+  the disjoint ranges would be
+
+  [A, B-1]: 100
+  [B, B]:   300
+  [B+1, C]: 200.
+
+  Example for zero value range:
+
+    |<--- 100 --->|
+                        |<--- 200 --->|
+  |<---------------  0  ----------------->|
+  A   B         C       D             E   F
+
+  [A, B-1]  : 0
+  [B, C]    : 100
+  [C+1, D-1]: 0
+  [D, E]    : 200
+  [E+1, F]  : 0
+  */
+  std::map<uint64_t, BoundaryPoint> Boundaries;
+
+  for (const auto &Item : Ranges) {
+    assert(Item.first.first <= Item.first.second &&
+           "Invalid instruction range");
+    auto &BeginPoint = Boundaries[Item.first.first];
+    auto &EndPoint = Boundaries[Item.first.second];
+    uint64_t Count = Item.second;
+
+    BeginPoint.addBeginCount(Count);
+    EndPoint.addEndCount(Count);
+    if (Count == 0) {
+      BeginPoint.IsZeroRangeBegin = true;
+      EndPoint.IsZeroRangeEnd = true;
+    }
+  }
+
+  // Use UINT64_MAX to indicate there is no existing range between BeginAddress
+  // and the next valid address
+  uint64_t BeginAddress = UINT64_MAX;
+  int ZeroRangeDepth = 0;
+  uint64_t Count = 0;
+  for (const auto &Item : Boundaries) {
+    uint64_t Address = Item.first;
+    const BoundaryPoint &Point = Item.second;
+    if (Point.BeginCount != UINT64_MAX) {
+      if (BeginAddress != UINT64_MAX)
+        DisjointRanges[{BeginAddress, Address - 1}] = Count;
+      Count += Point.BeginCount;
+      BeginAddress = Address;
+      ZeroRangeDepth += Point.IsZeroRangeBegin;
+    }
+    if (Point.EndCount != UINT64_MAX) {
+      assert((BeginAddress != UINT64_MAX) &&
+             "First boundary point cannot be 'end' point");
+      DisjointRanges[{BeginAddress, Address}] = Count;
+      assert(Count >= Point.EndCount && "Mismatched live ranges");
+      Count -= Point.EndCount;
+      BeginAddress = Address + 1;
+      ZeroRangeDepth -= Point.IsZeroRangeEnd;
+      // If the remaining count is zero and it's no longer in a zero range,
+      // this means we have consumed all the ranges before, thus mark
+      // BeginAddress as UINT64_MAX. e.g. supposing we have two
+      // non-overlapping ranges:
+      //  [<---- 10 ---->]
+      //                    [<---- 20 ---->]
+      //   A            B    C            D
+      // The BeginAddress (B+1) will reset to invalid (UINT64_MAX), so we
+      // won't have the [B+1, C-1] zero range.
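+      // Illustrative trace over the first example above: at A, Count becomes
+      // 300 and BeginAddress = A; at B, [A, B] is emitted with 300 and Count
+      // drops to 200; at C, [B+1, C] is emitted with 200 and Count reaches 0,
+      // so BeginAddress is reset to UINT64_MAX below.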
+ if (Count == 0 && ZeroRangeDepth == 0) + BeginAddress = UINT64_MAX; + } + } +} + +void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( + FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, + uint64_t Count) { + // Use the maximum count of samples with same line location + uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); + + if (LeadingIPOnly) { + // When computing an IP-based profile we take the SUM of counts at the + // location instead of applying duplication factors and taking the MAX. + FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, + Count); + } else { + // Otherwise, use duplication factor to compensate for loop + // unroll/vectorization. Note that this is only needed when we're taking + // MAX of the counts at the location instead of SUM. + Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); + + ErrorOr R = FunctionProfile.findSamplesAt( + LeafLoc.Location.LineOffset, Discriminator); + + uint64_t PreviousCount = R ? R.get() : 0; + if (PreviousCount <= Count) { + FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, + Count - PreviousCount); + } + } +} + +void ProfileGeneratorBase::updateTotalSamples() { + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateTotalSamples(); + } +} + +void ProfileGeneratorBase::updateCallsiteSamples() { + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateCallsiteSamples(); + } +} + +void ProfileGeneratorBase::updateFunctionSamples() { + updateCallsiteSamples(); + + if (UpdateTotalSamples) + updateTotalSamples(); +} + +void ProfileGeneratorBase::collectProfiledFunctions() { + std::unordered_set ProfiledFunctions; + if (collectFunctionsFromRawProfile(ProfiledFunctions)) + Binary->setProfiledFunctions(ProfiledFunctions); + else if (collectFunctionsFromLLVMProfile(ProfiledFunctions)) + Binary->setProfiledFunctions(ProfiledFunctions); + else + llvm_unreachable("Unsupported input profile"); +} + +bool ProfileGeneratorBase::collectFunctionsFromRawProfile( + std::unordered_set &ProfiledFunctions) { + if (!SampleCounters) + return false; + // Go through all the stacks, ranges and branches in sample counters, use + // the start of the range to look up the function it belongs and record the + // function. 
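+  // Note that for branch counters both the source and the target addresses
+  // are looked up below, since a call branch's source and target usually
+  // live in two different functions and both should be marked as profiled.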
+ for (const auto &CI : *SampleCounters) { + if (const auto *CtxKey = dyn_cast(CI.first.getPtr())) { + for (auto StackAddr : CtxKey->Context) { + if (FuncRange *FRange = Binary->findFuncRange(StackAddr)) + ProfiledFunctions.insert(FRange->Func); + } + } + + for (auto Item : CI.second.RangeCounter) { + uint64_t StartAddress = Item.first.first; + if (FuncRange *FRange = Binary->findFuncRange(StartAddress)) + ProfiledFunctions.insert(FRange->Func); + } + + for (auto Item : CI.second.BranchCounter) { + uint64_t SourceAddress = Item.first.first; + uint64_t TargetAddress = Item.first.second; + if (FuncRange *FRange = Binary->findFuncRange(SourceAddress)) + ProfiledFunctions.insert(FRange->Func); + if (FuncRange *FRange = Binary->findFuncRange(TargetAddress)) + ProfiledFunctions.insert(FRange->Func); + } + } + return true; +} + +bool ProfileGenerator::collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) { + for (const auto &FS : ProfileMap) { + if (auto *Func = Binary->getBinaryFunction(FS.second.getFunction())) + ProfiledFunctions.insert(Func); + } + return true; +} + +bool CSProfileGenerator::collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) { + for (auto *Node : ContextTracker) { + if (!Node->getFuncName().empty()) + if (auto *Func = Binary->getBinaryFunction(Node->getFuncName())) + ProfiledFunctions.insert(Func); + } + return true; +} + +FunctionSamples & +ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) { + SampleContext Context(FuncName); + return ProfileMap.create(Context); +} + +void ProfileGenerator::generateProfile() { + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) + Binary->decodePseudoProbe(); + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + } + + postProcessProfiles(); +} + +void ProfileGenerator::postProcessProfiles() { + computeSummaryAndThreshold(ProfileMap); + trimColdProfiles(ProfileMap, ColdCountThreshold); + filterAmbiguousProfile(ProfileMap); + calculateAndShowDensity(ProfileMap); +} + +void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold) { + if (!TrimColdProfile) + return; + + // Move cold profiles into a tmp container. + std::vector ColdProfileHashes; + for (const auto &I : ProfileMap) { + if (I.second.getTotalSamples() < ColdCntThreshold) + ColdProfileHashes.emplace_back(I.first); + } + + // Remove the cold profile from ProfileMap. 
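+  // (Collecting the hashes first and erasing afterwards avoids mutating
+  // ProfileMap while iterating over it above.)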
+ for (const auto &I : ColdProfileHashes) + ProfileMap.erase(I); +} + +void ProfileGenerator::generateLineNumBasedProfile() { + assert(SampleCounters->size() == 1 && + "Must have one entry for profile generation."); + const SampleCounter &SC = SampleCounters->begin()->second; + // Fill in function body samples + populateBodySamplesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForAllFunctions(SC.BranchCounter); + + updateFunctionSamples(); +} + +void ProfileGenerator::generateProbeBasedProfile() { + assert(SampleCounters->size() == 1 && + "Must have one entry for profile generation."); + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + const SampleCounter &SC = SampleCounters->begin()->second; + // Fill in function body samples + populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter); + + updateFunctionSamples(); +} + +void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions( + const RangeSample &RangeCounter) { + ProbeCounterMap ProbeCounter; + // preprocessRangeCounter returns disjoint ranges, so no longer to redo it + // inside extractProbesFromRange. + extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter, + false); + + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(Probe, FrameVec, true); + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, Count); + FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(), + Count); + if (Probe->isEntry()) + FunctionProfile.addHeadSamples(Count); + } +} + +void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + + // Record called target sample and its count. 
+ SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(CallProbe, FrameVec, true); + + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + FrameVec.back().Location.Discriminator, + FunctionId(CalleeName), Count); + } + } +} + +FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( + const SampleContextFrameVector &FrameVec, uint64_t Count) { + // Get top level profile + FunctionSamples *FunctionProfile = + &getTopLevelFunctionProfile(FrameVec[0].Func); + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + FunctionProfile->getFunction().getHashCode()); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + + for (size_t I = 1; I < FrameVec.size(); I++) { + LineLocation Callsite( + FrameVec[I - 1].Location.LineOffset, + getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); + FunctionSamplesMap &SamplesMap = + FunctionProfile->functionSamplesAt(Callsite); + auto Ret = + SamplesMap.emplace(FrameVec[I].Func, FunctionSamples()); + if (Ret.second) { + SampleContext Context(FrameVec[I].Func); + Ret.first->second.setContext(Context); + } + FunctionProfile = &Ret.first->second; + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + FunctionProfile->getFunction().getHashCode()); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + } + + return *FunctionProfile; +} + +RangeSample +ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { + RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); + if (FillZeroForAllFuncs) { + for (auto &FuncI : Binary->getAllBinaryFunctions()) { + for (auto &R : FuncI.second.Ranges) { + Ranges[{R.first, R.second - 1}] += 0; + } + } + } else { + // For each range, we search for all ranges of the function it belongs to + // and initialize it with zero count, so it remains zero if doesn't hit any + // samples. This is to be consistent with compiler that interpret zero count + // as unexecuted(cold). + for (const auto &I : RangeCounter) { + uint64_t StartAddress = I.first.first; + for (const auto &Range : Binary->getRanges(StartAddress)) + Ranges[{Range.first, Range.second - 1}] += 0; + } + } + RangeSample DisjointRanges; + findDisjointRanges(DisjointRanges, Ranges); + return DisjointRanges; +} + +void ProfileGenerator::populateBodySamplesForAllFunctions( + const RangeSample &RangeCounter) { + for (const auto &Range : preprocessRangeCounter(RangeCounter)) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + const SampleContextFrameVector FrameVec = + Binary->getFrameLocationStack(IP.Address); + if (!FrameVec.empty()) { + // FIXME: As accumulating total count per instruction caused some + // regression, we changed to accumulate total count per byte as a + // workaround. Tuning hotness threshold on the compiler side might be + // necessary in the future. 
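+        // For instance, with this per-byte accumulation, a 4-byte
+        // instruction in a range sampled 100 times contributes 400 to the
+        // total samples below.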
+        FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
+            FrameVec, Count * Binary->getInstSize(IP.Address));
+        updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(),
+                                            Count);
+      }
+    } while (IP.advance() && IP.Address <= RangeEnd);
+  }
+}
+
+StringRef
+ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
+  // Get the function range by branch target if it's a call branch.
+  auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress);
+
+  // We won't accumulate sample count for a range whose start is not the real
+  // function entry, such as an outlined function or inner labels.
+  if (!FRange || !FRange->IsFuncEntry)
+    return StringRef();
+
+  return FunctionSamples::getCanonicalFnName(FRange->getFuncName());
+}
+
+void ProfileGenerator::populateBoundarySamplesForAllFunctions(
+    const BranchSample &BranchCounters) {
+  for (const auto &Entry : BranchCounters) {
+    uint64_t SourceAddress = Entry.first.first;
+    uint64_t TargetAddress = Entry.first.second;
+    uint64_t Count = Entry.second;
+    assert(Count != 0 && "Unexpected zero weight branch");
+
+    StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
+    if (CalleeName.size() == 0)
+      continue;
+    // Record called target sample and its count.
+    const SampleContextFrameVector &FrameVec =
+        Binary->getCachedFrameLocationStack(SourceAddress);
+    if (!FrameVec.empty()) {
+      FunctionSamples &FunctionProfile =
+          getLeafProfileAndAddTotalSamples(FrameVec, 0);
+      FunctionProfile.addCalledTargetSamples(
+          FrameVec.back().Location.LineOffset,
+          getBaseDiscriminator(FrameVec.back().Location.Discriminator),
+          FunctionId(CalleeName), Count);
+    }
+    // Add head samples for callee.
+    FunctionSamples &CalleeProfile =
+        getTopLevelFunctionProfile(FunctionId(CalleeName));
+    CalleeProfile.addHeadSamples(Count);
+  }
+}
+
+void ProfileGeneratorBase::calculateBodySamplesAndSize(
+    const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
+    uint64_t &FuncBodySize) {
+  // Note that ideally the size should be the number of function instructions.
+  // However, for probe-based profile, we don't have the accurate instruction
+  // count for each probe; instead, the probe sample is the sample count for
+  // the block, which is equivalent to
+  // total_instruction_samples/num_of_instructions in one block. Hence, we use
+  // the number of probes as a proxy for the function's size.
+  FuncBodySize += FSamples.getBodySamples().size();
+
+  // The accumulated body samples re-calculated here could be different from
+  // the TotalSamples (getTotalSamples) field of FunctionSamples for
+  // line-number based profile. The reason is that TotalSamples is the sum of
+  // all the samples of the machine instructions in one source-code line,
+  // however, each entry of BodySamples keeps only the max of them, so
+  // TotalSamples is usually much bigger than the accumulated body samples, as
+  // one source-code line can emit many machine instructions. We observed a
+  // regression when we switched to using the accumulated body samples (via
+  // -update-total-samples). Hence, it's safer to re-calculate here to avoid
+  // such discrepancy. There is no problem for probe-based profile, as the
+  // TotalSamples is exactly the same as the accumulated body samples.
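+  // For instance, if one source line is lowered to 4 machine instructions,
+  // each sampled about 100 times, TotalSamples accumulates roughly 400 for
+  // that line while its single BodySamples entry keeps only the max (~100),
+  // which is why the two can diverge for line-number based profiles.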
+  for (const auto &I : FSamples.getBodySamples())
+    TotalBodySamples += I.second.getSamples();
+
+  for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
+    for (const auto &Callee : CallsiteSamples.second) {
+      // For binary-level density, the inlinees' samples and size should be
+      // included in the calculation.
+      calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
+                                  FuncBodySize);
+    }
+}
+
+// Calculate profile density:
+// Calculate the density for each function, sort the functions by density in
+// descending order, and keep accumulating their total samples until the sum
+// exceeds the percentage_threshold (cut-off) of total profile samples. The
+// profile density is then the last (minimum) function density among the
+// processed functions, which means that if the profile density is good, all
+// the functions that are hot in perf have good density. The
+// percentage_threshold (--profile-density-cutoff-hot) is configurable
+// depending on how much regression the system wants to tolerate.
+double
+ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
+  double ProfileDensity = 0.0;
+
+  uint64_t TotalProfileSamples = 0;
+  // A list of the function profile density and its total samples.
+  std::vector<std::pair<double, uint64_t>> FuncDensityList;
+  for (const auto &I : Profiles) {
+    uint64_t TotalBodySamples = 0;
+    uint64_t FuncBodySize = 0;
+    calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
+
+    if (FuncBodySize == 0)
+      continue;
+
+    double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
+    TotalProfileSamples += TotalBodySamples;
+    FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
+  }
+
+  // Sorted by the density in descending order.
+  llvm::stable_sort(FuncDensityList,
+                    [&](const std::pair<double, uint64_t> &A,
+                        const std::pair<double, uint64_t> &B) {
+                      if (A.first != B.first)
+                        return A.first > B.first;
+                      return A.second < B.second;
+                    });
+
+  uint64_t AccumulatedSamples = 0;
+  uint32_t I = 0;
+  assert(ProfileDensityCutOffHot <= 1000000 &&
+         "The cutoff value is greater than 1000000 (100%)");
+  while (AccumulatedSamples <
+             TotalProfileSamples *
+                 static_cast<double>(ProfileDensityCutOffHot) / 1000000 &&
+         I < FuncDensityList.size()) {
+    AccumulatedSamples += FuncDensityList[I].second;
+    ProfileDensity = FuncDensityList[I].first;
+    I++;
+  }
+
+  return ProfileDensity;
+}
+
+void ProfileGeneratorBase::calculateAndShowDensity(
+    const SampleProfileMap &Profiles) {
+  double Density = calculateDensity(Profiles);
+  showDensitySuggestion(Density);
+}
+
+FunctionSamples *
+CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
+                                               bool WasLeafInlined) {
+  FunctionSamples *FProfile = ContextNode->getFunctionSamples();
+  if (!FProfile) {
+    FSamplesList.emplace_back();
+    FProfile = &FSamplesList.back();
+    FProfile->setFunction(ContextNode->getFuncName());
+    ContextNode->setFunctionSamples(FProfile);
+  }
+  // Update ContextWasInlined attribute for existing contexts.
+  // The current function can be called in two ways:
+  //   - when processing a probe of the current frame
+  //   - when processing the entry probe of an inlinee's frame, which
+  //     is then used to update the callsite count of the current frame.
+  // The two can happen in any order, hence here we are making sure
+  // `ContextWasInlined` is always set as expected.
+  // TODO: Note that the former does not always happen if no probes of the
+  // current frame have samples, and if the latter happens, we could lose the
+  // attribute. This should be fixed.
+ if (WasLeafInlined) + FProfile->getContext().setAttribute(ContextWasInlined); + return FProfile; +} + +ContextTrieNode * +CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context, + bool WasLeafInlined) { + ContextTrieNode *ContextNode = + ContextTracker.getOrCreateContextPath(Context, true); + getOrCreateFunctionSamples(ContextNode, WasLeafInlined); + return ContextNode; +} + +void CSProfileGenerator::generateProfile() { + FunctionSamples::ProfileIsCS = true; + + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) { + Binary->decodePseudoProbe(); + if (InferMissingFrames) + initializeMissingFrameInferrer(); + } + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + } + + if (Binary->getTrackFuncContextSize()) + computeSizeForProfiledFunctions(); + + postProcessProfiles(); +} + +void CSProfileGenerator::initializeMissingFrameInferrer() { + Binary->getMissingContextInferrer()->initialize(SampleCounters); +} + +void CSProfileGenerator::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + Binary->inferMissingFrames(Context, NewContext); +} + +void CSProfileGenerator::computeSizeForProfiledFunctions() { + for (auto *Func : Binary->getProfiledFunctions()) + Binary->computeInlinedContextSizeForFunc(Func); + + // Flush the symbolizer to save memory. + Binary->flushSymbolizer(); +} + +void CSProfileGenerator::updateFunctionSamples() { + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + if (UpdateTotalSamples) + FSamples->updateTotalSamples(); + FSamples->updateCallsiteSamples(); + } + } +} + +void CSProfileGenerator::generateLineNumBasedProfile() { + for (const auto &CI : *SampleCounters) { + const auto *CtxKey = cast(CI.first.getPtr()); + + ContextTrieNode *ContextNode = &getRootContext(); + // Sample context will be empty if the jump is an external-to-internal call + // pattern, the head samples should be added for the internal function. + if (!CtxKey->Context.empty()) { + // Get or create function profile for the range + ContextNode = + getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined); + // Fill in function body samples + populateBodySamplesForFunction(*ContextNode->getFunctionSamples(), + CI.second.RangeCounter); + } + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter); + } + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. + populateInferredFunctionSamples(getRootContext()); + + updateFunctionSamples(); +} + +void CSProfileGenerator::populateBodySamplesForFunction( + FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location. + RangeSample Ranges; + findDisjointRanges(Ranges, RangeCounter); + for (const auto &Range : Ranges) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. 
+ if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address); + if (LeafLoc) { + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); + FunctionProfile.addTotalSamples(Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +void CSProfileGenerator::populateBoundarySamplesForFunction( + ContextTrieNode *Node, const BranchSample &BranchCounters) { + + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + + ContextTrieNode *CallerNode = Node; + LineLocation CalleeCallSite(0, 0); + if (CallerNode != &getRootContext()) { + // Record called target sample and its count + auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress); + if (LeafLoc) { + CallerNode->getFunctionSamples()->addCalledTargetSamples( + LeafLoc->Location.LineOffset, + getBaseDiscriminator(LeafLoc->Location.Discriminator), + FunctionId(CalleeName), + Count); + // Record head sample for called target(callee) + CalleeCallSite = LeafLoc->Location; + } + } + + ContextTrieNode *CalleeNode = + CallerNode->getOrCreateChildContext(CalleeCallSite, + FunctionId(CalleeName)); + FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode); + CalleeProfile->addHeadSamples(Count); + } +} + +void CSProfileGenerator::populateInferredFunctionSamples( + ContextTrieNode &Node) { + // There is no call jmp sample between the inliner and inlinee, we need to use + // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s + // sample depends on child(inlinee)'s sample, so traverse the tree in + // post-order. + for (auto &It : Node.getAllChildContext()) + populateInferredFunctionSamples(It.second); + + FunctionSamples *CalleeProfile = Node.getFunctionSamples(); + if (!CalleeProfile) + return; + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. + if (CalleeProfile->getHeadSamples()) + return; + ContextTrieNode *CallerNode = Node.getParentContext(); + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. + if (CallerNode == &getRootContext()) + return; + + LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc(); + FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode); + // Since we don't have call count for inlined functions, we + // estimate it from inlinee's profile using entry body sample. + uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate(); + // If we don't have samples with location, use 1 to indicate live. 
+ if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size()) + EstimatedCallCount = 1; + CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + Node.getFuncName(), EstimatedCallCount); + CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + EstimatedCallCount); + CallerProfile.addTotalSamples(EstimatedCallCount); +} + +void CSProfileGenerator::convertToProfileMap( + ContextTrieNode &Node, SampleContextFrameVector &Context) { + FunctionSamples *FProfile = Node.getFunctionSamples(); + if (FProfile) { + Context.emplace_back(Node.getFuncName(), LineLocation(0, 0)); + // Save the new context for future references. + SampleContextFrames NewContext = *Contexts.insert(Context).first; + auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile)); + FunctionSamples &NewProfile = Ret.first->second; + NewProfile.getContext().setContext(NewContext); + Context.pop_back(); + } + + for (auto &It : Node.getAllChildContext()) { + ContextTrieNode &ChildNode = It.second; + Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc()); + convertToProfileMap(ChildNode, Context); + Context.pop_back(); + } +} + +void CSProfileGenerator::convertToProfileMap() { + assert(ProfileMap.empty() && + "ProfileMap should be empty before converting from the trie"); + assert(IsProfileValidOnTrie && + "Do not convert the trie twice, it's already destroyed"); + + SampleContextFrameVector Context; + for (auto &It : getRootContext().getAllChildContext()) + convertToProfileMap(It.second, Context); + + IsProfileValidOnTrie = false; +} + +void CSProfileGenerator::postProcessProfiles() { + // Compute hot/cold threshold based on profile. This will be used for cold + // context profile merging/trimming. + computeSummaryAndThreshold(); + + // Run global pre-inliner to adjust/merge context profile based on estimated + // inline decisions. + if (EnableCSPreInliner) { + ContextTracker.populateFuncToCtxtMap(); + CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); + // Turn off the profile merger by default unless it is explicitly enabled. + if (!CSProfMergeColdContext.getNumOccurrences()) + CSProfMergeColdContext = false; + } + + convertToProfileMap(); + + // Trim and merge cold context profile using cold threshold above. + if (TrimColdProfile || CSProfMergeColdContext) { + SampleContextTrimmer(ProfileMap) + .trimAndMergeColdContextProfiles( + HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, + CSProfMaxColdContextDepth, EnableCSPreInliner); + } + + if (GenCSNestedProfile) { + ProfileConverter CSConverter(ProfileMap); + CSConverter.convertCSProfiles(); + FunctionSamples::ProfileIsCS = false; + } + filterAmbiguousProfile(ProfileMap); + ProfileGeneratorBase::calculateAndShowDensity(ProfileMap); +} + +void ProfileGeneratorBase::computeSummaryAndThreshold( + SampleProfileMap &Profiles) { + SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); + Summary = Builder.computeSummaryForProfiles(Profiles); + HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( + (Summary->getDetailedSummary())); + ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); +} + +void CSProfileGenerator::computeSummaryAndThreshold() { + // Always merge and use context-less profile map to compute summary. 
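+  // For example, the context profiles [main @ foo] and [bar @ foo] both
+  // contribute their samples to a single context-less profile for foo, and
+  // the summary is then computed over those merged profiles.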
+  SampleProfileMap ContextLessProfiles;
+  ContextTracker.createContextLessProfileMap(ContextLessProfiles);
+
+  // Set the flag below to avoid merging the profile again in
+  // computeSummaryAndThreshold
+  FunctionSamples::ProfileIsCS = false;
+  assert(
+      (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
+      "Don't set --profile-summary-contextless to false for profile "
+      "generation");
+  ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles);
+  // Recover the old value.
+  FunctionSamples::ProfileIsCS = true;
+}
+
+void ProfileGeneratorBase::extractProbesFromRange(
+    const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
+    bool FindDisjointRanges) {
+  const RangeSample *PRanges = &RangeCounter;
+  RangeSample Ranges;
+  if (FindDisjointRanges) {
+    findDisjointRanges(Ranges, RangeCounter);
+    PRanges = &Ranges;
+  }
+
+  for (const auto &Range : *PRanges) {
+    uint64_t RangeBegin = Range.first.first;
+    uint64_t RangeEnd = Range.first.second;
+    uint64_t Count = Range.second;
+
+    InstructionPointer IP(Binary, RangeBegin, true);
+    // Disjoint ranges may have a range in the middle of two instructions,
+    // e.g. if Instr1 is at Addr1 and Instr2 at Addr2, a disjoint range
+    // can be Addr1+1 to Addr2-1. We should ignore such ranges.
+    if (IP.Address > RangeEnd)
+      continue;
+
+    do {
+      const AddressProbesMap &Address2ProbesMap =
+          Binary->getAddress2ProbesMap();
+      auto It = Address2ProbesMap.find(IP.Address);
+      if (It != Address2ProbesMap.end()) {
+        for (const auto &Probe : It->second) {
+          ProbeCounter[&Probe] += Count;
+        }
+      }
+    } while (IP.advance() && IP.Address <= RangeEnd);
+  }
+}
+
+static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
+                                      const SmallVectorImpl<uint64_t> &AddrVec,
+                                      ProfiledBinary *Binary) {
+  SmallVector<const MCDecodedPseudoProbe *> Probes;
+  for (auto Address : reverse(AddrVec)) {
+    const MCDecodedPseudoProbe *CallProbe =
+        Binary->getCallProbeForAddr(Address);
+    // These could be the cases when a probe is not found at a callsite.
+    // Cutting off the context from here since the inliner will not know how
+    // to consume a context with unknown callsites.
+    // 1. for functions that are not sampled when
+    //    --decode-probe-for-profiled-functions-only is on.
+    // 2. for a merged callsite. Callsite merging may cause the loss of
+    //    original probe IDs.
+    // 3. for an external callsite.
+    if (!CallProbe)
+      break;
+    Probes.push_back(CallProbe);
+  }
+
+  std::reverse(Probes.begin(), Probes.end());
+
+  // Extract the context stack for reuse; the leaf context stack will be
+  // added, compressed, while looking up the function profile.
+  for (const auto *P : Probes) {
+    Binary->getInlineContextForProbe(P, ContextStack, true);
+  }
+}
+
+void CSProfileGenerator::generateProbeBasedProfile() {
+  // Enable pseudo probe functionalities in SampleProf
+  FunctionSamples::ProfileIsProbeBased = true;
+  for (const auto &CI : *SampleCounters) {
+    const AddrBasedCtxKey *CtxKey =
+        dyn_cast<AddrBasedCtxKey>(CI.first.getPtr());
+    // Fill in function body samples from probes, also infer caller's samples
+    // from callee's probe
+    populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey);
+    // Fill in boundary samples for a call probe
+    populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey);
+  }
+}
+
+void CSProfileGenerator::populateBodySamplesWithProbes(
+    const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
+  ProbeCounterMap ProbeCounter;
+  // Extract the top frame probes by looking up each address among the range
+  // in the Address2ProbeMap
+  extractProbesFromRange(RangeCounter, ProbeCounter);
+  std::unordered_map<MCDecodedPseudoProbeInlineTree *,
+                     std::unordered_set<FunctionSamples *>>
+      FrameSamples;
+  for (const auto &PI : ProbeCounter) {
+    const MCDecodedPseudoProbe *Probe = PI.first;
+    uint64_t Count = PI.second;
+    // Disjoint ranges may have introduced zero-filled gaps that don't
+    // belong to the current context; filter them out.
+    if (!Probe->isBlock() || Count == 0)
+      continue;
+
+    ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe);
+    FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
+    // Record the current frame and FunctionProfile whenever samples are
+    // collected for non-dangling probes. This is for reporting all of the
+    // zero count probes of the frame later.
+    FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile);
+    FunctionProfile.addBodySamples(Probe->getIndex(),
+                                   Probe->getDiscriminator(), Count);
+    FunctionProfile.addTotalSamples(Count);
+    if (Probe->isEntry()) {
+      FunctionProfile.addHeadSamples(Count);
+      // Look up for the caller's function profile
+      const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
+      ContextTrieNode *CallerNode = ContextNode->getParentContext();
+      if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
+        // Since the context id will be compressed, we have to use callee's
+        // context id to infer caller's context id to ensure they share the
+        // same context prefix.
+        uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
+        uint64_t CallerDiscriminator =
+            ContextNode->getCallSiteLoc().Discriminator;
+        assert(CallerIndex &&
+               "Inferred caller's location index shouldn't be zero!");
+        assert(!CallerDiscriminator &&
+               "Callsite probe should not have a discriminator!");
+        FunctionSamples &CallerProfile =
+            *getOrCreateFunctionSamples(CallerNode);
+        CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
+        CallerProfile.addBodySamples(CallerIndex, CallerDiscriminator, Count);
+        CallerProfile.addTotalSamples(Count);
+        CallerProfile.addCalledTargetSamples(CallerIndex, CallerDiscriminator,
+                                             ContextNode->getFuncName(),
+                                             Count);
+      }
+    }
+  }
+
+  // Assign zero count for remaining probes without sample hits to
+  // differentiate from probes optimized away, of which the counts are unknown
+  // and will be inferred by the compiler.
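+  // In other words, an explicit zero count asserts "executed zero times",
+  // while an absent probe count means "unknown; let the compiler infer it".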
+ for (auto &I : FrameSamples) { + for (auto *FunctionProfile : I.second) { + for (auto *Probe : I.first->getProbes()) { + FunctionProfile->addBodySamples(Probe->getIndex(), + Probe->getDiscriminator(), 0); + } + } + } +} + +void CSProfileGenerator::populateBoundarySamplesWithProbes( + const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) { + for (const auto &BI : BranchCounter) { + uint64_t SourceAddress = BI.first.first; + uint64_t TargetAddress = BI.first.second; + uint64_t Count = BI.second; + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + FunctionSamples &FunctionProfile = + getFunctionProfileForLeafProbe(CtxKey, CallProbe); + FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); + FunctionProfile.addTotalSamples(Count); + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), + CallProbe->getDiscriminator(), + FunctionId(CalleeName), Count); + } +} + +ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe( + const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { + + const SmallVectorImpl *PContext = &CtxKey->Context; + SmallVector NewContext; + + if (InferMissingFrames) { + SmallVector Context = CtxKey->Context; + // Append leaf frame for a complete inference. + Context.push_back(LeafProbe->getAddress()); + inferMissingFrames(Context, NewContext); + // Pop out the leaf probe that was pushed in above. + NewContext.pop_back(); + PContext = &NewContext; + } + + SampleContextFrameVector ContextStack; + extractPrefixContextStack(ContextStack, *PContext, Binary); + + // Explicitly copy the context for appending the leaf context + SampleContextFrameVector NewContextStack(ContextStack.begin(), + ContextStack.end()); + Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); + // For leaf inlined context with the top frame, we should strip off the top + // frame's probe id, like: + // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" + auto LeafFrame = NewContextStack.back(); + LeafFrame.Location = LineLocation(0, 0); + NewContextStack.pop_back(); + // Compress the context string except for the leaf frame + CSProfileGenerator::compressRecursionContext(NewContextStack); + CSProfileGenerator::trimContext(NewContextStack); + NewContextStack.push_back(LeafFrame); + + const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); + bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); + ContextTrieNode *ContextNode = + getOrCreateContextNode(NewContextStack, WasLeafInlined); + ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash); + return ContextNode; +} + +FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( + const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { + return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples(); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-19.1/ProfileGenerator.h b/tools/ldc-profgen/ldc-profgen-19.1/ProfileGenerator.h new file mode 100644 index 00000000000..5e36128530c --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/ProfileGenerator.h @@ -0,0 +1,401 @@ +//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#include "CSPreInliner.h" +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProfWriter.h" +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +using ProbeCounterMap = + std::unordered_map; + +// This base class for profile generation of sample-based PGO. We reuse all +// structures relating to function profiles and profile writers as seen in +// /ProfileData/SampleProf.h. +class ProfileGeneratorBase { + +public: + ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){}; + ProfileGeneratorBase(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : Binary(Binary), SampleCounters(Counters){}; + ProfileGeneratorBase(ProfiledBinary *Binary, + const SampleProfileMap &&Profiles) + : Binary(Binary), ProfileMap(std::move(Profiles)){}; + + virtual ~ProfileGeneratorBase() = default; + static std::unique_ptr + create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters, + bool profileIsCS); + static std::unique_ptr + create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap, + bool profileIsCS); + virtual void generateProfile() = 0; + void write(); + + static uint32_t + getDuplicationFactor(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? 1 + : llvm::DILocation::getDuplicationFactorFromDiscriminator( + Discriminator); + } + + static uint32_t + getBaseDiscriminator(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? Discriminator + : DILocation::getBaseDiscriminatorFromDiscriminator( + Discriminator, /* IsFSDiscriminator */ false); + } + + static bool UseFSDiscriminator; + +protected: + // Use SampleProfileWriter to serialize profile map + void write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap); + /* + For each region boundary point, mark if it is begin or end (or both) of + the region. Boundary points are inclusive. Log the sample count as well + so we can use it when we compute the sample count of each disjoint region + later. Note that there might be multiple ranges with different sample + count that share same begin/end point. We need to accumulate the sample + count for the boundary point for such case, because for the example + below, + + |<--100-->| + |<------200------>| + A B C + + sample count for disjoint region [A,B] would be 300. 
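+    The remaining disjoint region [B+1, C] would get 200, matching the
+    worked examples in ProfileGenerator.cpp.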
+ */ + void findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges); + + // Go through each address from range to extract the top frame probe by + // looking up in the Address2ProbeMap + void extractProbesFromRange(const RangeSample &RangeCounter, + ProbeCounterMap &ProbeCounter, + bool FindDisjointRanges = true); + + // Helper function for updating body sample for a leaf location in + // FunctionProfile + void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, + const SampleContextFrame &LeafLoc, + uint64_t Count); + + void updateFunctionSamples(); + + void updateTotalSamples(); + + void updateCallsiteSamples(); + + void filterAmbiguousProfile(SampleProfileMap &Profiles); + + bool filterAmbiguousProfile(FunctionSamples &FS); + + StringRef getCalleeNameForAddress(uint64_t TargetAddress); + + void computeSummaryAndThreshold(SampleProfileMap &ProfileMap); + + void calculateBodySamplesAndSize(const FunctionSamples &FSamples, + uint64_t &TotalBodySamples, + uint64_t &FuncBodySize); + + double calculateDensity(const SampleProfileMap &Profiles); + + void calculateAndShowDensity(const SampleProfileMap &Profiles); + + void showDensitySuggestion(double Density); + + void collectProfiledFunctions(); + + bool collectFunctionsFromRawProfile( + std::unordered_set &ProfiledFunctions); + + // Collect profiled Functions for llvm sample profile input. + virtual bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) = 0; + + // List of function prefix to filter out. + static constexpr const char *FuncPrefixsToFilter[] = {"__cxx_global_var_init", + "__tls_init"}; + + // Thresholds from profile summary to answer isHotCount/isColdCount queries. + uint64_t HotCountThreshold; + + uint64_t ColdCountThreshold; + + ProfiledBinary *Binary = nullptr; + + std::unique_ptr Summary; + + // Used by SampleProfileWriter + SampleProfileMap ProfileMap; + + const ContextSampleCounterMap *SampleCounters = nullptr; +}; + +class ProfileGenerator : public ProfileGeneratorBase { + +public: + ProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : ProfileGeneratorBase(Binary, Counters){}; + ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles) + : ProfileGeneratorBase(Binary, std::move(Profiles)){}; + void generateProfile() override; + +private: + void generateLineNumBasedProfile(); + void generateProbeBasedProfile(); + RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); + FunctionSamples &getTopLevelFunctionProfile(FunctionId FuncName); + // Helper function to get the leaf frame's FunctionProfile by traversing the + // inline stack and meanwhile it adds the total samples for each frame's + // function profile. 
+  FunctionSamples &
+  getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec,
+                                   uint64_t Count);
+  void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter);
+  void
+  populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters);
+  void
+  populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter);
+  void populateBoundarySamplesWithProbesForAllFunctions(
+      const BranchSample &BranchCounters);
+  void postProcessProfiles();
+  void trimColdProfiles(const SampleProfileMap &Profiles,
+                        uint64_t ColdCntThreshold);
+  bool collectFunctionsFromLLVMProfile(
+      std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override;
+};
+
+class CSProfileGenerator : public ProfileGeneratorBase {
+public:
+  CSProfileGenerator(ProfiledBinary *Binary,
+                     const ContextSampleCounterMap *Counters)
+      : ProfileGeneratorBase(Binary, Counters){};
+  CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles)
+      : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){};
+  void generateProfile() override;
+
+  // Trim the context stack at a given depth.
+  template <typename T>
+  static void trimContext(SmallVectorImpl<T> &S, int Depth = MaxContextDepth) {
+    if (Depth < 0 || static_cast<size_t>(Depth) >= S.size())
+      return;
+    std::copy(S.begin() + S.size() - static_cast<size_t>(Depth), S.end(),
+              S.begin());
+    S.resize(Depth);
+  }
+
+  // Remove adjacent repeated context sequences up to a given sequence length;
+  // -1 means no size limit. Note that repeated sequences are identified based
+  // on the exact call site; this is finer granularity than function recursion.
+  template <typename T>
+  static void compressRecursionContext(SmallVectorImpl<T> &Context,
+                                       int32_t CSize = MaxCompressionSize) {
+    uint32_t I = 1;
+    uint32_t HS = static_cast<uint32_t>(Context.size() / 2);
+    uint32_t MaxDedupSize =
+        CSize == -1 ? HS : std::min(static_cast<uint32_t>(CSize), HS);
+    auto BeginIter = Context.begin();
+    // Use an in-place algorithm to save memory copy
+    // End indicates the end location of current iteration's data
+    uint32_t End = 0;
+    // Deduplicate from length 1 to the max possible size of a repeated
+    // sequence.
+    while (I <= MaxDedupSize) {
+      // This is a linear algorithm that deduplicates adjacent repeated
+      // sequences of size I. The deduplication detection runs on a sliding
+      // window whose size is 2*I, and it keeps sliding the window to
+      // deduplicate the data inside. Once duplication is detected,
+      // deduplicate it by skipping the right half part of the window;
+      // otherwise just copy back the new one by appending it at the back of
+      // the End pointer (for the next iteration).
+      //
+      // For example:
+      // Input: [a1, a2, b1, b2]
+      // (Index added to distinguish the same char; the origin is [a, a, b,
+      // b], and the size of the dedup window is 2 (I = 1) at the beginning)
+      //
+      // 1) The initial status is a dummy window [null, a1], then just copy
+      //    the right half of the window (End = 0), then slide the window.
+      //    Result: [a1], a2, b1, b2 (End points to the element right before
+      //    ]; after ] is the data of the previous iteration)
+      //
+      // 2) Next window is [a1, a2]. Since a1 == a2, skip the right half of
+      //    the window, i.e. the duplication happens. Only slide the window.
+      //    Result: [a1], a2, b1, b2
+      //
+      // 3) Next window is [a2, b1]; copy the right half of the window (b1 is
+      //    new) to the End and slide the window.
+      //    Result: [a1, b1], b1, b2
+      //
+      // 4) Next window is [b1, b2], same as in 2), skip b2.
+      //    Result: [a1, b1], b1, b2
+      //    After resize, it will be [a, b]
+
+      // Use pointers like below to do comparison inside the window
+      //  [a b c a b c]
+      //   |     |     |     |      |
+      //   LeftBoundary Left Right Left+I Right+I
+      // A duplication is found if Left < LeftBoundary.
+
+      int32_t Right = I - 1;
+      End = I;
+      int32_t LeftBoundary = 0;
+      while (Right + I < Context.size()) {
+        // To avoid scanning a part of a sequence repeatedly, find the common
+        // suffix of the two halves in the window. The common suffix will
+        // serve as the common prefix of the next possible pair of duplicate
+        // sequences. The non-common part will be ignored and never scanned
+        // again.
+
+        // For example.
+        // Input: [a, b1], c1, b2, c2
+        // I = 2
+        //
+        // 1) For the window [a, b1, c1, b2], the non-common-suffix for the
+        //    right part is 'c1'; copy it and only slide the window 1 step.
+        //    Result: [a, b1, c1], b2, c2
+        //
+        // 2) Next window is [b1, c1, b2, c2], so duplication happens.
+        //    Result after resize: [a, b, c]
+
+        int32_t Left = Right;
+        while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) {
+          // Find the longest suffix inside the window. When it stops, Left
+          // points at the diverging point in the current sequence.
+          Left--;
+        }
+
+        bool DuplicationFound = (Left < LeftBoundary);
+        // Don't need to recheck the data before Right
+        LeftBoundary = Right + 1;
+        if (DuplicationFound) {
+          // Duplication found; skip the right half of the window.
+          Right += I;
+        } else {
+          // Copy the non-common-suffix part of the adjacent sequence.
+          std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1,
+                    BeginIter + End);
+          End += Left + I - Right;
+          // Only slide the window by the size of the non-common-suffix
+          Right = Left + I;
+        }
+      }
+      // Don't forget the remaining part that's not scanned.
+      std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End);
+      End += Context.size() - Right - 1;
+      I++;
+      Context.resize(End);
+      MaxDedupSize = std::min(static_cast<uint32_t>(End / 2), MaxDedupSize);
+    }
+  }
+
+private:
+  void generateLineNumBasedProfile();
+
+  FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
+                                              bool WasLeafInlined = false);
+
+  // Lookup or create ContextTrieNode for the context; FunctionSamples is
+  // created inside this function.
+  ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context,
+                                          bool WasLeafInlined = false);
+
+  // For profiled-only functions, compute on demand their inline-context
+  // function byte size, which is used by the pre-inliner.
+  void computeSizeForProfiledFunctions();
+  // Post processing for profiles before writing out, such as merging
+  // and trimming cold profiles, and running the preinliner on profiles.
+ void postProcessProfiles(); + + void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, + const RangeSample &RangeCounters); + + void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, + const BranchSample &BranchCounters); + + void populateInferredFunctionSamples(ContextTrieNode &Node); + + void updateFunctionSamples(); + + void generateProbeBasedProfile(); + + // Fill in function body samples from probes + void populateBodySamplesWithProbes(const RangeSample &RangeCounter, + const AddrBasedCtxKey *CtxKey); + // Fill in boundary samples for a call probe + void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, + const AddrBasedCtxKey *CtxKey); + + ContextTrieNode * + getContextNodeForLeafProbe(const AddrBasedCtxKey *CtxKey, + const MCDecodedPseudoProbe *LeafProbe); + + // Helper function to get FunctionSamples for the leaf probe + FunctionSamples & + getFunctionProfileForLeafProbe(const AddrBasedCtxKey *CtxKey, + const MCDecodedPseudoProbe *LeafProbe); + + void convertToProfileMap(ContextTrieNode &Node, + SampleContextFrameVector &Context); + + void convertToProfileMap(); + + void computeSummaryAndThreshold(); + + bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) override; + + void initializeMissingFrameInferrer(); + + // Given an input `Context`, output `NewContext` with inferred missing tail + // call frames. + void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + + ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; + + // The container for holding the FunctionSamples used by context trie. + std::list FSamplesList; + + // Underlying context table serves for sample profile writer. + std::unordered_set Contexts; + + SampleContextTracker ContextTracker; + + bool IsProfileValidOnTrie = true; + +public: + // Deduplicate adjacent repeated context sequences up to a given sequence + // length. -1 means no size limit. + static int32_t MaxCompressionSize; + static int MaxContextDepth; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-19.1/ProfiledBinary.cpp b/tools/ldc-profgen/ldc-profgen-19.1/ProfiledBinary.cpp new file mode 100644 index 00000000000..632ddc7b50f --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-19.1/ProfiledBinary.cpp @@ -0,0 +1,1034 @@ +//===-- ProfiledBinary.cpp - Binary decoder ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfiledBinary.h" +#include "ErrorHandling.h" +#include "MissingFrameInferrer.h" +#include "ProfileGenerator.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Object/COFF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/TargetParser/Triple.h" +#include + +#define DEBUG_TYPE "load-binary" + +using namespace llvm; +using namespace sampleprof; + +cl::opt ShowDisassemblyOnly("show-disassembly-only", + cl::desc("Print disassembled code.")); + +cl::opt ShowSourceLocations("show-source-locations", + cl::desc("Print source locations.")); + +static cl::opt + ShowCanonicalFnName("show-canonical-fname", + cl::desc("Print canonical function name.")); + +static cl::opt ShowPseudoProbe( + "show-pseudo-probe", + cl::desc("Print pseudo probe section and disassembled info.")); + +static cl::opt UseDwarfCorrelation( + "use-dwarf-correlation", + cl::desc("Use dwarf for profile correlation even when binary contains " + "pseudo probe.")); + +static cl::opt + DWPPath("dwp", cl::init(""), + cl::desc("Path of .dwp file. When not specified, it will be " + ".dwp in the same directory as the main binary.")); + +static cl::list DisassembleFunctions( + "disassemble-functions", cl::CommaSeparated, + cl::desc("List of functions to print disassembly for. Accept demangled " + "names only. Only work with show-disassembly-only")); + +static cl::opt + KernelBinary("kernel", + cl::desc("Generate the profile for Linux kernel binary.")); + +extern cl::opt ShowDetailedWarning; +extern cl::opt InferMissingFrames; + +namespace llvm { +namespace sampleprof { + +static const Target *getTarget(const ObjectFile *Obj) { + Triple TheTriple = Obj->makeTriple(); + std::string Error; + std::string ArchName; + const Target *TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) + exitWithError(Error, Obj->getFileName()); + return TheTarget; +} + +void BinarySizeContextTracker::addInstructionForContext( + const SampleContextFrameVector &Context, uint32_t InstrSize) { + ContextTrieNode *CurNode = &RootContext; + bool IsLeaf = true; + for (const auto &Callsite : reverse(Context)) { + FunctionId CallerName = Callsite.Func; + LineLocation CallsiteLoc = IsLeaf ? LineLocation(0, 0) : Callsite.Location; + CurNode = CurNode->getOrCreateChildContext(CallsiteLoc, CallerName); + IsLeaf = false; + } + + CurNode->addFunctionSize(InstrSize); +} + +uint32_t +BinarySizeContextTracker::getFuncSizeForContext(const ContextTrieNode *Node) { + ContextTrieNode *CurrNode = &RootContext; + ContextTrieNode *PrevNode = nullptr; + + std::optional Size; + + // Start from top-level context-less function, traverse down the reverse + // context trie to find the best/longest match for given context, then + // retrieve the size. 
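+  //
+  // A brief illustrative walk-through (the context below is made up, not
+  // taken from any real profile): querying the size of "bar" under the
+  // context [main @1, foo @2, bar] starts at the root's "bar" child, then
+  // follows the child keyed by call site @2 and caller "foo", then the child
+  // keyed by @1 and caller "main". The last node on that path that carries a
+  // recorded size provides the estimate, so a partial match still yields the
+  // closest known context size.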
+ LineLocation CallSiteLoc(0, 0); + while (CurrNode && Node->getParentContext() != nullptr) { + PrevNode = CurrNode; + CurrNode = CurrNode->getChildContext(CallSiteLoc, Node->getFuncName()); + if (CurrNode && CurrNode->getFunctionSize()) + Size = *CurrNode->getFunctionSize(); + CallSiteLoc = Node->getCallSiteLoc(); + Node = Node->getParentContext(); + } + + // If we traversed all nodes along the path of the context and haven't + // found a size yet, pivot to look for size from sibling nodes, i.e size + // of inlinee under different context. + if (!Size) { + if (!CurrNode) + CurrNode = PrevNode; + while (!Size && CurrNode && !CurrNode->getAllChildContext().empty()) { + CurrNode = &CurrNode->getAllChildContext().begin()->second; + if (CurrNode->getFunctionSize()) + Size = *CurrNode->getFunctionSize(); + } + } + + assert(Size && "We should at least find one context size."); + return *Size; +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder) { + ProbeFrameStack ProbeContext; + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) + trackInlineesOptimizedAway(ProbeDecoder, *Child.second, ProbeContext); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, ProbeFrameStack &ProbeContext) { + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName; + ProbeContext.emplace_back(FuncName, 0); + + // This ProbeContext has a probe, so it has code before inlining and + // optimization. Make sure we mark its size as known. + if (!ProbeNode.getProbes().empty()) { + ContextTrieNode *SizeContext = &RootContext; + for (auto &ProbeFrame : reverse(ProbeContext)) { + StringRef CallerName = ProbeFrame.first; + LineLocation CallsiteLoc(ProbeFrame.second, 0); + SizeContext = + SizeContext->getOrCreateChildContext(CallsiteLoc, + FunctionId(CallerName)); + } + // Add 0 size to make known. + SizeContext->addFunctionSize(0); + } + + // DFS down the probe inline tree + for (const auto &ChildNode : ProbeNode.getChildren()) { + InlineSite Location = ChildNode.first; + ProbeContext.back().second = std::get<1>(Location); + trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second, ProbeContext); + } + + ProbeContext.pop_back(); +} + +ProfiledBinary::ProfiledBinary(const StringRef ExeBinPath, + const StringRef DebugBinPath) + : Path(ExeBinPath), DebugBinaryPath(DebugBinPath), + SymbolizerOpts(getSymbolizerOpts()), ProEpilogTracker(this), + Symbolizer(std::make_unique(SymbolizerOpts)), + TrackFuncContextSize(EnableCSPreInliner && UseContextCostForPreInliner) { + // Point to executable binary if debug info binary is not specified. + SymbolizerPath = DebugBinPath.empty() ? 
ExeBinPath : DebugBinPath; + if (InferMissingFrames) + MissingContextInferrer = std::make_unique(this); + load(); +} + +ProfiledBinary::~ProfiledBinary() {} + +void ProfiledBinary::warnNoFuncEntry() { + uint64_t NoFuncEntryNum = 0; + for (auto &F : BinaryFunctions) { + if (F.second.Ranges.empty()) + continue; + bool hasFuncEntry = false; + for (auto &R : F.second.Ranges) { + if (FuncRange *FR = findFuncRangeForStartAddr(R.first)) { + if (FR->IsFuncEntry) { + hasFuncEntry = true; + break; + } + } + } + + if (!hasFuncEntry) { + NoFuncEntryNum++; + if (ShowDetailedWarning) + WithColor::warning() + << "Failed to determine function entry for " << F.first + << " due to inconsistent name from symbol table and dwarf info.\n"; + } + } + emitWarningSummary(NoFuncEntryNum, BinaryFunctions.size(), + "of functions failed to determine function entry due to " + "inconsistent name from symbol table and dwarf info."); +} + +void ProfiledBinary::load() { + // Attempt to open the binary. + OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + + IsCOFF = isa(&ExeBinary); + if (!isa(&ExeBinary) && !IsCOFF) + exitWithError("not a valid ELF/COFF image", Path); + + auto *Obj = cast(&ExeBinary); + TheTriple = Obj->makeTriple(); + + LLVM_DEBUG(dbgs() << "Loading " << Path << "\n"); + + // Mark the binary as a kernel image; + IsKernel = KernelBinary; + + // Find the preferred load address for text sections. + setPreferredTextSegmentAddresses(Obj); + + // Load debug info of subprograms from DWARF section. + // If path of debug info binary is specified, use the debug info from it, + // otherwise use the debug info from the executable binary. + if (!DebugBinaryPath.empty()) { + OwningBinary DebugPath = + unwrapOrError(createBinary(DebugBinaryPath), DebugBinaryPath); + loadSymbolsFromDWARF(*cast(DebugPath.getBinary())); + } else { + loadSymbolsFromDWARF(*cast(&ExeBinary)); + } + + DisassembleFunctionSet.insert(DisassembleFunctions.begin(), + DisassembleFunctions.end()); + + if (auto *ELFObj = dyn_cast(Obj)) { + checkPseudoProbe(ELFObj); + if (UsePseudoProbes) + populateElfSymbolAddressList(ELFObj); + + if (ShowDisassemblyOnly) + decodePseudoProbe(ELFObj); + } + + // Disassemble the text sections. + disassemble(Obj); + + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologAddresses(StartAddrToFuncRangeMap); + ProEpilogTracker.inferEpilogAddresses(RetAddressSet); + + warnNoFuncEntry(); + + // TODO: decode other sections. 
+}
+
+bool ProfiledBinary::inlineContextEqual(uint64_t Address1, uint64_t Address2) {
+  const SampleContextFrameVector &Context1 =
+      getCachedFrameLocationStack(Address1);
+  const SampleContextFrameVector &Context2 =
+      getCachedFrameLocationStack(Address2);
+  if (Context1.size() != Context2.size())
+    return false;
+  if (Context1.empty())
+    return false;
+  // The leaf frame contains the location within the leaf, and it needs to be
+  // removed as it's not part of the calling context.
+  return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1,
+                    Context2.begin(), Context2.begin() + Context2.size() - 1);
+}
+
+SampleContextFrameVector
+ProfiledBinary::getExpandedContext(const SmallVectorImpl<uint64_t> &Stack,
+                                   bool &WasLeafInlined) {
+  SampleContextFrameVector ContextVec;
+  if (Stack.empty())
+    return ContextVec;
+  // Process from frame root to leaf.
+  for (auto Address : Stack) {
+    const SampleContextFrameVector &ExpandedContext =
+        getCachedFrameLocationStack(Address);
+    // An instruction without a valid debug line will be ignored by sample
+    // processing.
+    if (ExpandedContext.empty())
+      return SampleContextFrameVector();
+    // Set WasLeafInlined based on the inlined frame count of the last
+    // address, which is the leaf.
+    WasLeafInlined = (ExpandedContext.size() > 1);
+    ContextVec.append(ExpandedContext);
+  }
+
+  // Replace with the decoded base discriminator.
+  for (auto &Frame : ContextVec) {
+    Frame.Location.Discriminator = ProfileGeneratorBase::getBaseDiscriminator(
+        Frame.Location.Discriminator, UseFSDiscriminator);
+  }
+
+  assert(ContextVec.size() && "Context length should be at least 1");
+
+  // Compress the context string except for the leaf frame.
+  auto LeafFrame = ContextVec.back();
+  LeafFrame.Location = LineLocation(0, 0);
+  ContextVec.pop_back();
+  CSProfileGenerator::compressRecursionContext(ContextVec);
+  CSProfileGenerator::trimContext(ContextVec);
+  ContextVec.push_back(LeafFrame);
+  return ContextVec;
+}
+
+template <class ELFT>
+void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile<ELFT> &Obj,
+                                                      StringRef FileName) {
+  const auto &PhdrRange = unwrapOrError(Obj.program_headers(), FileName);
+  // FIXME: This should be the page size of the system running profiling.
+  // However such info isn't available at post-processing time, so assume a
+  // 4K page for now. Note that we don't use EXEC_PAGESIZE from
+  // <linux/param.h> because we may build the tools on non-Linux.
+  uint64_t PageSize = 0x1000;
+  for (const typename ELFT::Phdr &Phdr : PhdrRange) {
+    if (Phdr.p_type == ELF::PT_LOAD) {
+      if (!FirstLoadableAddress)
+        FirstLoadableAddress = Phdr.p_vaddr & ~(PageSize - 1U);
+      if (Phdr.p_flags & ELF::PF_X) {
+        // Segments will always be loaded at a page boundary.
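+        // For example (illustrative numbers): with 4K pages, a segment with
+        // p_vaddr 0x401234 is recorded as 0x401234 & ~0xFFF = 0x401000.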
+ PreferredTextSegmentAddresses.push_back(Phdr.p_vaddr & + ~(PageSize - 1U)); + TextSegmentOffsets.push_back(Phdr.p_offset & ~(PageSize - 1U)); + } + } + } + + if (PreferredTextSegmentAddresses.empty()) + exitWithError("no executable segment found", FileName); +} + +void ProfiledBinary::setPreferredTextSegmentAddresses(const COFFObjectFile *Obj, + StringRef FileName) { + uint64_t ImageBase = Obj->getImageBase(); + if (!ImageBase) + exitWithError("Not a COFF image", FileName); + + PreferredTextSegmentAddresses.push_back(ImageBase); + FirstLoadableAddress = ImageBase; + + for (SectionRef Section : Obj->sections()) { + const coff_section *Sec = Obj->getCOFFSection(Section); + if (Sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE) + TextSegmentOffsets.push_back(Sec->VirtualAddress); + } +} + +void ProfiledBinary::setPreferredTextSegmentAddresses(const ObjectFile *Obj) { + if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *COFFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(COFFObj, Obj->getFileName()); + else + llvm_unreachable("invalid object format"); +} + +void ProfiledBinary::checkPseudoProbe(const ELFObjectFileBase *Obj) { + if (UseDwarfCorrelation) + return; + + bool HasProbeDescSection = false; + bool HasPseudoProbeSection = false; + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + if (SectionName == ".pseudo_probe_desc") { + HasProbeDescSection = true; + } else if (SectionName == ".pseudo_probe") { + HasPseudoProbeSection = true; + } + } + + // set UsePseudoProbes flag, used for PerfReader + UsePseudoProbes = HasProbeDescSection && HasPseudoProbeSection; +} + +void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { + if (!UsePseudoProbes) + return; + + MCPseudoProbeDecoder::Uint64Set GuidFilter; + MCPseudoProbeDecoder::Uint64Map FuncStartAddresses; + if (ShowDisassemblyOnly) { + if (DisassembleFunctionSet.empty()) { + FuncStartAddresses = SymbolStartAddrs; + } else { + for (auto &F : DisassembleFunctionSet) { + auto GUID = Function::getGUID(F.first()); + if (auto StartAddr = SymbolStartAddrs.lookup(GUID)) { + FuncStartAddresses[GUID] = StartAddr; + FuncRange &Range = StartAddrToFuncRangeMap[StartAddr]; + GuidFilter.insert(Function::getGUID(Range.getFuncName())); + } + } + } + } else { + for (auto *F : ProfiledFunctions) { + GuidFilter.insert(Function::getGUID(F->FuncName)); + for (auto &Range : F->Ranges) { + auto GUIDs = StartAddrToSymMap.equal_range(Range.first); + for (auto I = GUIDs.first; I != GUIDs.second; ++I) + FuncStartAddresses[I->second] = I->first; + } + } + } + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (SectionName == ".pseudo_probe_desc") { + StringRef Contents = 
unwrapOrError(Section.getContents(), FileName);
+      if (!ProbeDecoder.buildGUID2FuncDescMap(
+              reinterpret_cast<const uint8_t *>(Contents.data()),
+              Contents.size()))
+        exitWithError(
+            "Pseudo Probe decoder failed in .pseudo_probe_desc section");
+    } else if (SectionName == ".pseudo_probe") {
+      StringRef Contents = unwrapOrError(Section.getContents(), FileName);
+      if (!ProbeDecoder.buildAddress2ProbeMap(
+              reinterpret_cast<const uint8_t *>(Contents.data()),
+              Contents.size(), GuidFilter, FuncStartAddresses))
+        exitWithError("Pseudo Probe decoder failed in .pseudo_probe section");
+    }
+  }
+
+  // Build TopLevelProbeFrameMap to track size for optimized inlinees when
+  // probe info is available.
+  if (TrackFuncContextSize) {
+    for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) {
+      auto *Frame = Child.second.get();
+      StringRef FuncName =
+          ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName;
+      TopLevelProbeFrameMap[FuncName] = Frame;
+    }
+  }
+
+  if (ShowPseudoProbe)
+    ProbeDecoder.printGUID2FuncDescMap(outs());
+}
+
+void ProfiledBinary::decodePseudoProbe() {
+  OwningBinary<Binary> OBinary = unwrapOrError(createBinary(Path), Path);
+  Binary &ExeBinary = *OBinary.getBinary();
+  auto *Obj = cast<ELFObjectFileBase>(&ExeBinary);
+  decodePseudoProbe(Obj);
+}
+
+void ProfiledBinary::setIsFuncEntry(FuncRange *FuncRange,
+                                    StringRef RangeSymName) {
+  // Skip external function symbols.
+  if (!FuncRange)
+    return;
+
+  // Set IsFuncEntry to true if there is only one range in the function or the
+  // RangeSymName from ELF is equal to its DWARF-based function name.
+  if (FuncRange->Func->Ranges.size() == 1 ||
+      (!FuncRange->IsFuncEntry && FuncRange->getFuncName() == RangeSymName))
+    FuncRange->IsFuncEntry = true;
+}
+
+bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
+                                        SectionSymbolsTy &Symbols,
+                                        const SectionRef &Section) {
+  std::size_t SE = Symbols.size();
+  uint64_t SectionAddress = Section.getAddress();
+  uint64_t SectSize = Section.getSize();
+  uint64_t StartAddress = Symbols[SI].Addr;
+  uint64_t NextStartAddress =
+      (SI + 1 < SE) ? Symbols[SI + 1].Addr : SectionAddress + SectSize;
+  FuncRange *FRange = findFuncRange(StartAddress);
+  setIsFuncEntry(FRange, FunctionSamples::getCanonicalFnName(Symbols[SI].Name));
+  StringRef SymbolName =
+      ShowCanonicalFnName
+          ? FunctionSamples::getCanonicalFnName(Symbols[SI].Name)
+          : Symbols[SI].Name;
+  bool ShowDisassembly =
+      ShowDisassemblyOnly && (DisassembleFunctionSet.empty() ||
+                              DisassembleFunctionSet.count(SymbolName));
+  if (ShowDisassembly)
+    outs() << '<' << SymbolName << ">:\n";
+
+  uint64_t Address = StartAddress;
+  // Size of the consecutive invalid-instruction range that ends at
+  // Address - 1.
+  uint64_t InvalidInstLength = 0;
+  while (Address < NextStartAddress) {
+    MCInst Inst;
+    uint64_t Size;
+    // Disassemble an instruction.
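+    // Note: when decoding fails, Size is forced to 1 below so the scan can
+    // resume at the next byte; the undecodable bytes are accumulated into
+    // InvalidInstLength and reported later as an invalid-instruction range.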
+ bool Disassembled = DisAsm->getInstruction( + Inst, Size, Bytes.slice(Address - SectionAddress), Address, nulls()); + if (Size == 0) + Size = 1; + + if (ShowDisassembly) { + if (ShowPseudoProbe) { + ProbeDecoder.printProbeForAddress(outs(), Address); + } + outs() << format("%8" PRIx64 ":", Address); + size_t Start = outs().tell(); + if (Disassembled) + IPrinter->printInst(&Inst, Address + Size, "", *STI, outs()); + else + outs() << "\t"; + if (ShowSourceLocations) { + unsigned Cur = outs().tell() - Start; + if (Cur < 40) + outs().indent(40 - Cur); + InstructionPointer IP(this, Address); + outs() << getReversedLocWithContext( + symbolize(IP, ShowCanonicalFnName, ShowPseudoProbe)); + } + outs() << "\n"; + } + + if (Disassembled) { + const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + + // Record instruction size. + AddressToInstSizeMap[Address] = Size; + + // Populate address maps. + CodeAddressVec.push_back(Address); + if (MCDesc.isCall()) { + CallAddressSet.insert(Address); + UncondBranchAddrSet.insert(Address); + } else if (MCDesc.isReturn()) { + RetAddressSet.insert(Address); + UncondBranchAddrSet.insert(Address); + } else if (MCDesc.isBranch()) { + if (MCDesc.isUnconditionalBranch()) + UncondBranchAddrSet.insert(Address); + BranchAddressSet.insert(Address); + } + + // Record potential call targets for tail frame inference later-on. + if (InferMissingFrames && FRange) { + uint64_t Target = 0; + MIA->evaluateBranch(Inst, Address, Size, Target); + if (MCDesc.isCall()) { + // Indirect call targets are unknown at this point. Recording the + // unknown target (zero) for further LBR-based refinement. + MissingContextInferrer->CallEdges[Address].insert(Target); + } else if (MCDesc.isUnconditionalBranch()) { + assert(Target && + "target should be known for unconditional direct branch"); + // Any inter-function unconditional jump is considered tail call at + // this point. This is not 100% accurate and could further be + // optimized based on some source annotation. + FuncRange *ToFRange = findFuncRange(Target); + if (ToFRange && ToFRange->Func != FRange->Func) + MissingContextInferrer->TailCallEdges[Address].insert(Target); + LLVM_DEBUG({ + dbgs() << "Direct Tail call: " << format("%8" PRIx64 ":", Address); + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), dbgs()); + dbgs() << "\n"; + }); + } else if (MCDesc.isIndirectBranch() && MCDesc.isBarrier()) { + // This is an indirect branch but not necessarily an indirect tail + // call. The isBarrier check is to filter out conditional branch. + // Similar with indirect call targets, recording the unknown target + // (zero) for further LBR-based refinement. 
+ MissingContextInferrer->TailCallEdges[Address].insert(Target); + LLVM_DEBUG({ + dbgs() << "Indirect Tail call: " + << format("%8" PRIx64 ":", Address); + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), dbgs()); + dbgs() << "\n"; + }); + } + } + + if (InvalidInstLength) { + AddrsWithInvalidInstruction.insert( + {Address - InvalidInstLength, Address - 1}); + InvalidInstLength = 0; + } + } else { + InvalidInstLength += Size; + } + + Address += Size; + } + + if (InvalidInstLength) + AddrsWithInvalidInstruction.insert( + {Address - InvalidInstLength, Address - 1}); + + if (ShowDisassembly) + outs() << "\n"; + + return true; +} + +void ProfiledBinary::setUpDisassembler(const ObjectFile *Obj) { + const Target *TheTarget = getTarget(Obj); + std::string TripleName = TheTriple.getTriple(); + StringRef FileName = Obj->getFileName(); + + MRI.reset(TheTarget->createMCRegInfo(TripleName)); + if (!MRI) + exitWithError("no register info for target " + TripleName, FileName); + + MCTargetOptions MCOptions; + AsmInfo.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + if (!AsmInfo) + exitWithError("no assembly info for target " + TripleName, FileName); + + Expected Features = Obj->getFeatures(); + if (!Features) + exitWithError(Features.takeError(), FileName); + STI.reset( + TheTarget->createMCSubtargetInfo(TripleName, "", Features->getString())); + if (!STI) + exitWithError("no subtarget info for target " + TripleName, FileName); + + MII.reset(TheTarget->createMCInstrInfo()); + if (!MII) + exitWithError("no instruction info for target " + TripleName, FileName); + + MCContext Ctx(Triple(TripleName), AsmInfo.get(), MRI.get(), STI.get()); + std::unique_ptr MOFI( + TheTarget->createMCObjectFileInfo(Ctx, /*PIC=*/false)); + Ctx.setObjectFileInfo(MOFI.get()); + DisAsm.reset(TheTarget->createMCDisassembler(*STI, Ctx)); + if (!DisAsm) + exitWithError("no disassembler for target " + TripleName, FileName); + + MIA.reset(TheTarget->createMCInstrAnalysis(MII.get())); + + int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); + IPrinter.reset(TheTarget->createMCInstPrinter( + Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); + IPrinter->setPrintBranchImmAsAddress(true); +} + +void ProfiledBinary::disassemble(const ObjectFile *Obj) { + // Set up disassembler and related components. + setUpDisassembler(Obj); + + // Create a mapping from virtual address to symbol name. The symbols in text + // sections are the candidates to dissassemble. + std::map AllSymbols; + StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + section_iterator SecI = unwrapOrError(Symbol.getSection(), FileName); + if (SecI != Obj->section_end()) + AllSymbols[*SecI].push_back(SymbolInfoTy(Addr, Name, ELF::STT_NOTYPE)); + } + + // Sort all the symbols. Use a stable sort to stabilize the output. + for (std::pair &SecSyms : AllSymbols) + stable_sort(SecSyms.second); + + assert((DisassembleFunctionSet.empty() || ShowDisassemblyOnly) && + "Functions to disassemble should be only specified together with " + "--show-disassembly-only"); + + if (ShowDisassemblyOnly) + outs() << "\nDisassembly of " << FileName << ":\n"; + + // Dissassemble a text section. 
+ for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isText()) + continue; + + uint64_t ImageLoadAddr = getPreferredBaseAddress(); + uint64_t SectionAddress = Section.getAddress() - ImageLoadAddr; + uint64_t SectSize = Section.getSize(); + if (!SectSize) + continue; + + // Register the text section. + TextSections.insert({SectionAddress, SectSize}); + + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (ShowDisassemblyOnly) { + outs() << "\nDisassembly of section " << SectionName; + outs() << " [" << format("0x%" PRIx64, Section.getAddress()) << ", " + << format("0x%" PRIx64, Section.getAddress() + SectSize) + << "]:\n\n"; + } + + if (isa(Obj) && SectionName == ".plt") + continue; + + // Get the section data. + ArrayRef Bytes = + arrayRefFromStringRef(unwrapOrError(Section.getContents(), FileName)); + + // Get the list of all the symbols in this section. + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + // Disassemble symbol by symbol. + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (!dissassembleSymbol(SI, Bytes, Symbols, Section)) + exitWithError("disassembling error", FileName); + } + } + + if (!AddrsWithInvalidInstruction.empty()) { + if (ShowDetailedWarning) { + for (auto &Addr : AddrsWithInvalidInstruction) { + WithColor::warning() + << "Invalid instructions at " << format("%8" PRIx64, Addr.first) + << " - " << format("%8" PRIx64, Addr.second) << "\n"; + } + } + WithColor::warning() << "Found " << AddrsWithInvalidInstruction.size() + << " invalid instructions\n"; + AddrsWithInvalidInstruction.clear(); + } + + // Dissassemble rodata section to check if FS discriminator symbol exists. + checkUseFSDiscriminator(Obj, AllSymbols); +} + +void ProfiledBinary::checkUseFSDiscriminator( + const ObjectFile *Obj, std::map &AllSymbols) { + const char *FSDiscriminatorVar = "__llvm_fs_discriminator__"; + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isData() || Section.getSize() == 0) + continue; + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (Symbols[SI].Name == FSDiscriminatorVar) { + UseFSDiscriminator = true; + return; + } + } + } +} + +void ProfiledBinary::populateElfSymbolAddressList( + const ELFObjectFileBase *Obj) { + // Create a mapping from virtual address to symbol GUID and the other way + // around. 
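+  // The GUID here is the MD5-based hash computed by Function::getGUID from
+  // the symbol name; keeping both directions lets the pseudo probe decoder
+  // map a profiled address to a GUID and a GUID back to its start address.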
+ StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + uint64_t GUID = Function::getGUID(Name); + SymbolStartAddrs[GUID] = Addr; + StartAddrToSymMap.emplace(Addr, GUID); + } +} + +void ProfiledBinary::loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit) { + for (const auto &DieInfo : CompilationUnit.dies()) { + llvm::DWARFDie Die(&CompilationUnit, &DieInfo); + + if (!Die.isSubprogramDIE()) + continue; + auto Name = Die.getName(llvm::DINameKind::LinkageName); + if (!Name) + Name = Die.getName(llvm::DINameKind::ShortName); + if (!Name) + continue; + + auto RangesOrError = Die.getAddressRanges(); + if (!RangesOrError) + continue; + const DWARFAddressRangesVector &Ranges = RangesOrError.get(); + + if (Ranges.empty()) + continue; + + // Different DWARF symbols can have same function name, search or create + // BinaryFunction indexed by the name. + auto Ret = BinaryFunctions.emplace(Name, BinaryFunction()); + auto &Func = Ret.first->second; + if (Ret.second) + Func.FuncName = Ret.first->first; + + for (const auto &Range : Ranges) { + uint64_t StartAddress = Range.LowPC; + uint64_t EndAddress = Range.HighPC; + + if (EndAddress <= StartAddress || + StartAddress < getPreferredBaseAddress()) + continue; + + // We may want to know all ranges for one function. Here group the + // ranges and store them into BinaryFunction. + Func.Ranges.emplace_back(StartAddress, EndAddress); + + auto R = StartAddrToFuncRangeMap.emplace(StartAddress, FuncRange()); + if (R.second) { + FuncRange &FRange = R.first->second; + FRange.Func = &Func; + FRange.StartAddress = StartAddress; + FRange.EndAddress = EndAddress; + } else { + AddrsWithMultipleSymbols.insert(StartAddress); + if (ShowDetailedWarning) + WithColor::warning() + << "Duplicated symbol start address at " + << format("%8" PRIx64, StartAddress) << " " + << R.first->second.getFuncName() << " and " << Name << "\n"; + } + } + } +} + +void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) { + auto DebugContext = llvm::DWARFContext::create( + Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, DWPPath); + if (!DebugContext) + exitWithError("Error creating the debug info context", Path); + + for (const auto &CompilationUnit : DebugContext->compile_units()) + loadSymbolsFromDWARFUnit(*CompilationUnit); + + // Handles DWO sections that can either be in .o, .dwo or .dwp files. + uint32_t NumOfDWOMissing = 0; + for (const auto &CompilationUnit : DebugContext->compile_units()) { + DWARFUnit *const DwarfUnit = CompilationUnit.get(); + if (DwarfUnit->getDWOId()) { + DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false).getDwarfUnit(); + if (!DWOCU->isDWOUnit()) { + NumOfDWOMissing++; + if (ShowDetailedWarning) { + std::string DWOName = dwarf::toString( + DwarfUnit->getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + WithColor::warning() << "DWO debug information for " << DWOName + << " was not loaded.\n"; + } + continue; + } + loadSymbolsFromDWARFUnit(*DWOCU); + } + } + + if (NumOfDWOMissing) + WithColor::warning() + << " DWO debug information was not loaded for " << NumOfDWOMissing + << " modules. 
Please check the .o, .dwo or .dwp path.\n";
+  if (BinaryFunctions.empty())
+    WithColor::warning() << "Loading of DWARF info completed, but no binary "
+                            "functions have been retrieved.\n";
+  // Populate the hash binary function map for MD5 function name lookup. This
+  // is done after BinaryFunctions are finalized.
+  for (auto &BinaryFunction : BinaryFunctions) {
+    HashBinaryFunctions[MD5Hash(StringRef(BinaryFunction.first))] =
+        &BinaryFunction.second;
+  }
+
+  if (!AddrsWithMultipleSymbols.empty()) {
+    WithColor::warning() << "Found " << AddrsWithMultipleSymbols.size()
+                         << " start addresses with multiple symbols\n";
+    AddrsWithMultipleSymbols.clear();
+  }
+}
+
+void ProfiledBinary::populateSymbolListFromDWARF(
+    ProfileSymbolList &SymbolList) {
+  for (auto &I : StartAddrToFuncRangeMap)
+    SymbolList.add(I.second.getFuncName());
+}
+
+symbolize::LLVMSymbolizer::Options ProfiledBinary::getSymbolizerOpts() const {
+  symbolize::LLVMSymbolizer::Options SymbolizerOpts;
+  SymbolizerOpts.PrintFunctions =
+      DILineInfoSpecifier::FunctionNameKind::LinkageName;
+  SymbolizerOpts.Demangle = false;
+  SymbolizerOpts.DefaultArch = TheTriple.getArchName().str();
+  SymbolizerOpts.UseSymbolTable = false;
+  SymbolizerOpts.RelativeAddresses = false;
+  SymbolizerOpts.DWPName = DWPPath;
+  return SymbolizerOpts;
+}
+
+SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP,
+                                                   bool UseCanonicalFnName,
+                                                   bool UseProbeDiscriminator) {
+  assert(this == IP.Binary &&
+         "Binary should only symbolize its own instruction");
+  auto Addr = object::SectionedAddress{IP.Address,
+                                       object::SectionedAddress::UndefSection};
+  DIInliningInfo InlineStack = unwrapOrError(
+      Symbolizer->symbolizeInlinedCode(SymbolizerPath.str(), Addr),
+      SymbolizerPath);
+
+  SampleContextFrameVector CallStack;
+  for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) {
+    const auto &CallerFrame = InlineStack.getFrame(I);
+    if (CallerFrame.FunctionName.empty() ||
+        (CallerFrame.FunctionName == "<invalid>"))
+      break;
+
+    StringRef FunctionName(CallerFrame.FunctionName);
+    if (UseCanonicalFnName)
+      FunctionName = FunctionSamples::getCanonicalFnName(FunctionName);
+
+    uint32_t Discriminator = CallerFrame.Discriminator;
+    uint32_t LineOffset = (CallerFrame.Line - CallerFrame.StartLine) & 0xffff;
+    if (UseProbeDiscriminator) {
+      LineOffset =
+          PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator);
+      Discriminator = 0;
+    }
+
+    LineLocation Line(LineOffset, Discriminator);
+    auto It = NameStrings.insert(FunctionName.str());
+    CallStack.emplace_back(FunctionId(StringRef(*It.first)), Line);
+  }
+
+  return CallStack;
+}
+
+void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t RangeBegin,
+                                                       uint64_t RangeEnd) {
+  InstructionPointer IP(this, RangeBegin, true);
+
+  if (IP.Address != RangeBegin)
+    WithColor::warning() << "Invalid start instruction at "
+                         << format("%8" PRIx64, RangeBegin) << "\n";
+
+  if (IP.Address >= RangeEnd)
+    return;
+
+  do {
+    const SampleContextFrameVector SymbolizedCallStack =
+        getFrameLocationStack(IP.Address, UsePseudoProbes);
+    uint64_t Size = AddressToInstSizeMap[IP.Address];
+    // Record the instruction size for the corresponding context.
+    FuncSizeTracker.addInstructionForContext(SymbolizedCallStack, Size);
+
+  } while (IP.advance() && IP.Address < RangeEnd);
+}
+
+void ProfiledBinary::computeInlinedContextSizeForFunc(
+    const BinaryFunction *Func) {
+  // Note that a function can be split into multiple ranges, so compute for all
+  // ranges of the function.
+  for (const auto &Range : Func->Ranges)
+    computeInlinedContextSizeForRange(Range.first, Range.second);
+
+  // Track optimized-away inlinees for a probed binary. A function inlined and
+  // then optimized away should still have its probes left over in place.
+  if (usePseudoProbes()) {
+    auto I = TopLevelProbeFrameMap.find(Func->FuncName);
+    if (I != TopLevelProbeFrameMap.end()) {
+      BinarySizeContextTracker::ProbeFrameStack ProbeContext;
+      FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder, *I->second,
+                                                 ProbeContext);
+    }
+  }
+}
+
+void ProfiledBinary::inferMissingFrames(
+    const SmallVectorImpl<uint64_t> &Context,
+    SmallVectorImpl<uint64_t> &NewContext) {
+  MissingContextInferrer->inferMissingFrames(Context, NewContext);
+}
+
+InstructionPointer::InstructionPointer(const ProfiledBinary *Binary,
+                                       uint64_t Address, bool RoundToNext)
+    : Binary(Binary), Address(Address) {
+  Index = Binary->getIndexForAddr(Address);
+  if (RoundToNext) {
+    // We might get an address that is not code; in that case it should round
+    // to the next valid address.
+    if (Index >= Binary->getCodeAddrVecSize())
+      this->Address = UINT64_MAX;
+    else
+      this->Address = Binary->getAddressforIndex(Index);
+  }
+}
+
+bool InstructionPointer::advance() {
+  Index++;
+  if (Index >= Binary->getCodeAddrVecSize()) {
+    Address = UINT64_MAX;
+    return false;
+  }
+  Address = Binary->getAddressforIndex(Index);
+  return true;
+}
+
+bool InstructionPointer::backward() {
+  if (Index == 0) {
+    Address = 0;
+    return false;
+  }
+  Index--;
+  Address = Binary->getAddressforIndex(Index);
+  return true;
+}
+
+void InstructionPointer::update(uint64_t Addr) {
+  Address = Addr;
+  Index = Binary->getIndexForAddr(Address);
+}
+
+} // end namespace sampleprof
+} // end namespace llvm
diff --git a/tools/ldc-profgen/ldc-profgen-19.1/ProfiledBinary.h b/tools/ldc-profgen/ldc-profgen-19.1/ProfiledBinary.h
new file mode 100644
index 00000000000..f2eeca45454
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-19.1/ProfiledBinary.h
@@ -0,0 +1,619 @@
+//===-- ProfiledBinary.h - Binary decoder -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H +#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H + +#include "CallContext.h" +#include "ErrorHandling.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCPseudoProbe.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Path.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +extern cl::opt EnableCSPreInliner; +extern cl::opt UseContextCostForPreInliner; +} // namespace llvm + +using namespace llvm; +using namespace sampleprof; +using namespace llvm::object; + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; +class MissingFrameInferrer; + +struct InstructionPointer { + const ProfiledBinary *Binary; + // Address of the executable segment of the binary. + uint64_t Address; + // Index to the sorted code address array of the binary. + uint64_t Index = 0; + InstructionPointer(const ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext = false); + bool advance(); + bool backward(); + void update(uint64_t Addr); +}; + +// The special frame addresses. +enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. + // This's also used to indicate the call stack should be truncated since this + // isn't a real call context the compiler will see. + ExternalAddr = 1, +}; + +using RangesTy = std::vector>; + +struct BinaryFunction { + StringRef FuncName; + // End of range is an exclusive bound. + RangesTy Ranges; + + uint64_t getFuncSize() { + uint64_t Sum = 0; + for (auto &R : Ranges) { + Sum += R.second - R.first; + } + return Sum; + } +}; + +// Info about function range. A function can be split into multiple +// non-continuous ranges, each range corresponds to one FuncRange. +struct FuncRange { + uint64_t StartAddress; + // EndAddress is an exclusive bound. + uint64_t EndAddress; + // Function the range belongs to + BinaryFunction *Func; + // Whether the start address is the real entry of the function. + bool IsFuncEntry = false; + + StringRef getFuncName() { return Func->FuncName; } +}; + +// PrologEpilog address tracker, used to filter out broken stack samples +// Currently we use a heuristic size (two) to infer prolog and epilog +// based on the start address and return address. In the future, +// we will switch to Dwarf CFI based tracker +struct PrologEpilogTracker { + // A set of prolog and epilog addresses. Used by virtual unwinding. 
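+  // For example (hypothetical addresses): a function entry at 0x1000 whose
+  // next instruction starts at 0x1004 contributes {0x1000, 0x1004} as prolog;
+  // a return at 0x1ff8 preceded by an instruction at 0x1ff4 contributes
+  // {0x1ff4, 0x1ff8} as epilog.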
+  std::unordered_set<uint64_t> PrologEpilogSet;
+  ProfiledBinary *Binary;
+  PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){};
+
+  // Take the two addresses from the start of the function as prolog.
+  void
+  inferPrologAddresses(std::map<uint64_t, FuncRange> &FuncStartAddressMap) {
+    for (auto I : FuncStartAddressMap) {
+      PrologEpilogSet.insert(I.first);
+      InstructionPointer IP(Binary, I.first);
+      if (!IP.advance())
+        break;
+      PrologEpilogSet.insert(IP.Address);
+    }
+  }
+
+  // Take the return address and the address before it as epilog.
+  void inferEpilogAddresses(std::unordered_set<uint64_t> &RetAddrs) {
+    for (auto Addr : RetAddrs) {
+      PrologEpilogSet.insert(Addr);
+      InstructionPointer IP(Binary, Addr);
+      if (!IP.backward())
+        break;
+      PrologEpilogSet.insert(IP.Address);
+    }
+  }
+};
+
+// Track function byte size under different contexts (the outlined version as
+// well as various inlined versions). It also provides query support to get
+// function size with the best matching context, which is used to help the
+// pre-inliner use accurate post-optimization size to make decisions.
+// TODO: If an inlinee is completely optimized away, ideally we should have
+// zero for its context size; currently we would miss such contexts since they
+// don't have instructions. To fix this, we need to mark all inlinees with an
+// entry probe but without instructions as having zero size.
+class BinarySizeContextTracker {
+public:
+  // Add an instruction with the given size to a context.
+  void addInstructionForContext(const SampleContextFrameVector &Context,
+                                uint32_t InstrSize);
+
+  // Get function size with a specific context. When there's no exact match
+  // for the given context, try to retrieve the size of that function from
+  // the closest matching context.
+  uint32_t getFuncSizeForContext(const ContextTrieNode *Context);
+
+  // For inlinees that are fully optimized away, we can establish zero size
+  // using their remaining probes.
+  void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder);
+
+  using ProbeFrameStack = SmallVector<std::pair<StringRef, uint32_t>>;
+  void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder,
+                                  MCDecodedPseudoProbeInlineTree &ProbeNode,
+                                  ProbeFrameStack &Context);
+
+  void dump() { RootContext.dumpTree(); }
+
+private:
+  // Root node for the context trie; note that this is a reverse context trie
+  // with callee as parent and caller as child. This way we can traverse from
+  // the root to find the best/longest matching context if an exact match does
+  // not exist. It gives us the best possible estimate for a function's
+  // post-inline, post-optimization byte size.
+  ContextTrieNode RootContext;
+};
+
+using AddressRange = std::pair<uint64_t, uint64_t>;
+
+class ProfiledBinary {
+  // Absolute path of the executable binary.
+  std::string Path;
+  // Path of the debug info binary.
+  std::string DebugBinaryPath;
+  // The target triple.
+  Triple TheTriple;
+  // Path used by the symbolizer; it should point to the binary with debug
+  // info.
+  StringRef SymbolizerPath;
+  // Options used to configure the symbolizer.
+  symbolize::LLVMSymbolizer::Options SymbolizerOpts;
+  // The runtime base address that the first executable segment is loaded at.
+  uint64_t BaseAddress = 0;
+  // The runtime base address that the first loadable segment is loaded at.
+  uint64_t FirstLoadableAddress = 0;
+  // The preferred load address of each executable segment.
+  std::vector<uint64_t> PreferredTextSegmentAddresses;
+  // The file offset of each executable segment.
+  std::vector<uint64_t> TextSegmentOffsets;
+
+  // Multiple MC component info
+  std::unique_ptr<const MCRegisterInfo> MRI;
+  std::unique_ptr<const MCAsmInfo> AsmInfo;
+  std::unique_ptr<const MCSubtargetInfo> STI;
+  std::unique_ptr<const MCInstrInfo> MII;
+  std::unique_ptr<MCDisassembler> DisAsm;
+  std::unique_ptr<const MCInstrAnalysis> MIA;
+  std::unique_ptr<MCInstPrinter> IPrinter;
+  // A list of text sections sorted by start RVA and size. Used to check
+  // if a given RVA is a valid code address.
+  std::set<std::pair<uint64_t, uint64_t>> TextSections;
+
+  // A map from function name to BinaryFunction info.
+  std::unordered_map<std::string, BinaryFunction> BinaryFunctions;
+
+  // Lookup BinaryFunctions using the function name's MD5 hash. Needed if the
+  // profile is using MD5.
+  std::unordered_map<uint64_t, BinaryFunction *> HashBinaryFunctions;
+
+  // A list of binary functions that have samples.
+  std::unordered_set<const BinaryFunction *> ProfiledFunctions;
+
+  // GUID to ELF symbol start address map
+  DenseMap<uint64_t, uint64_t> SymbolStartAddrs;
+
+  // These maps are for temporary use of warning diagnosis.
+  DenseSet<uint64_t> AddrsWithMultipleSymbols;
+  DenseSet<std::pair<uint64_t, uint64_t>> AddrsWithInvalidInstruction;
+
+  // Start address to ELF symbol GUID map
+  std::unordered_multimap<uint64_t, uint64_t> StartAddrToSymMap;
+
+  // An ordered map from a function's start address to its function range
+  // relevant info. Currently, to determine if an ELF offset is the start of
+  // a real function, we leverage the function range info from DWARF.
+  std::map<uint64_t, FuncRange> StartAddrToFuncRangeMap;
+
+  // Address to context location map. Used to expand the context.
+  std::unordered_map<uint64_t, SampleContextFrameVector> AddressToLocStackMap;
+
+  // Address to instruction size map. Also used for quick address lookup.
+  std::unordered_map<uint64_t, uint64_t> AddressToInstSizeMap;
+
+  // An array of addresses of all instructions sorted in increasing order. The
+  // sorting is needed to quickly advance to the next forward/backward
+  // instruction.
+  std::vector<uint64_t> CodeAddressVec;
+  // A set of call instruction addresses. Used by virtual unwinding.
+  std::unordered_set<uint64_t> CallAddressSet;
+  // A set of return instruction addresses. Used by virtual unwinding.
+  std::unordered_set<uint64_t> RetAddressSet;
+  // An ordered set of unconditional branch instruction addresses.
+  std::set<uint64_t> UncondBranchAddrSet;
+  // A set of branch instruction addresses.
+  std::unordered_set<uint64_t> BranchAddressSet;
+
+  // Estimate and track function prolog and epilog ranges.
+  PrologEpilogTracker ProEpilogTracker;
+
+  // Infer missing frames due to compiler optimizations such as tail call
+  // elimination.
+  std::unique_ptr<MissingFrameInferrer> MissingContextInferrer;
+
+  // Track function sizes under different contexts.
+  BinarySizeContextTracker FuncSizeTracker;
+
+  // The symbolizer used to get inline context for an instruction.
+  std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;
+
+  // String table owning function name strings created from the symbolizer.
+  std::unordered_set<std::string> NameStrings;
+
+  // A collection of functions to print disassembly for.
+  StringSet<> DisassembleFunctionSet;
+
+  // Pseudo probe decoder
+  MCPseudoProbeDecoder ProbeDecoder;
+
+  // Function name to probe frame map for top-level outlined functions.
+  StringMap<MCDecodedPseudoProbeInlineTree *> TopLevelProbeFrameMap;
+
+  bool UsePseudoProbes = false;
+
+  bool UseFSDiscriminator = false;
+
+  // Whether we need to symbolize all instructions to get function context
+  // size.
+  bool TrackFuncContextSize = false;
+
+  // Whether this is a kernel image.
+  bool IsKernel = false;
+
+  // Indicates whether the base loading address was parsed from the mmap event
+  // or uses the preferred address.
+  bool IsLoadedByMMap = false;
+  // Used to avoid redundant warnings.
+  bool MissingMMapWarned = false;
+
+  bool IsCOFF = false;
+
+  void setPreferredTextSegmentAddresses(const ObjectFile *O);
+
+  template <class ELFT>
+  void setPreferredTextSegmentAddresses(const ELFFile<ELFT> &Obj,
+                                        StringRef FileName);
+  void setPreferredTextSegmentAddresses(const COFFObjectFile *Obj,
+                                        StringRef FileName);
+
+  void checkPseudoProbe(const ELFObjectFileBase *Obj);
+
+  void decodePseudoProbe(const ELFObjectFileBase *Obj);
+
+  void
+  checkUseFSDiscriminator(const ObjectFile *Obj,
+                          std::map<SectionRef, SectionSymbolsTy> &AllSymbols);
+
+  // Set up disassembler and related components.
+  void setUpDisassembler(const ObjectFile *Obj);
+  symbolize::LLVMSymbolizer::Options getSymbolizerOpts() const;
+
+  // Load debug info of subprograms from DWARF section.
+  void loadSymbolsFromDWARF(ObjectFile &Obj);
+
+  // Load debug info from DWARF unit.
+  void loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit);
+
+  // Create ELF symbol to its start address mapping.
+  void populateElfSymbolAddressList(const ELFObjectFileBase *O);
+
+  // A function may be split into multiple non-contiguous address ranges. We
+  // use this to mark whether the start of a function range is the real entry
+  // of the function, and to set false for non-function labels.
+  void setIsFuncEntry(FuncRange *FRange, StringRef RangeSymName);
+
+  // Warn if no entry range exists in the function.
+  void warnNoFuncEntry();
+
+  /// Disassemble the text sections and build various address maps.
+  void disassemble(const ObjectFile *O);
+
+  /// Helper function to disassemble the symbol and extract info for unwinding.
+  bool dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
+                          SectionSymbolsTy &Symbols, const SectionRef &Section);
+  /// Symbolize a given instruction pointer and return a full call context.
+  SampleContextFrameVector symbolize(const InstructionPointer &IP,
+                                     bool UseCanonicalFnName = false,
+                                     bool UseProbeDiscriminator = false);
+  /// Decode the interesting parts of the binary and build internal data
+  /// structures. At a high level, the parts of interest are:
+  ///   1. Text sections, including the main code section and the PLT
+  ///   entries that will be used to handle cross-module call transitions.
+  ///   2. The .debug_line section, used by Dwarf-based profile generation.
+  ///   3. Pseudo probe related sections, used by probe-based profile
+  ///   generation.
+  void load();
+
+public:
+  ProfiledBinary(const StringRef ExeBinPath, const StringRef DebugBinPath);
+  ~ProfiledBinary();
+
+  void decodePseudoProbe();
+
+  StringRef getPath() const { return Path; }
+  StringRef getName() const { return llvm::sys::path::filename(Path); }
+  uint64_t getBaseAddress() const { return BaseAddress; }
+  void setBaseAddress(uint64_t Address) { BaseAddress = Address; }
+
+  bool isCOFF() const { return IsCOFF; }
+
+  // Canonicalize to use the preferred load address as base address.
+  uint64_t canonicalizeVirtualAddress(uint64_t Address) {
+    return Address - BaseAddress + getPreferredBaseAddress();
+  }
+  // Return the preferred load address for the first executable segment.
+  uint64_t getPreferredBaseAddress() const {
+    return PreferredTextSegmentAddresses[0];
+  }
+  // Return the preferred load address for the first loadable segment.
+  uint64_t getFirstLoadableAddress() const { return FirstLoadableAddress; }
+  // Return the file offset for the first executable segment.
+ uint64_t getTextSegmentOffset() const { return TextSegmentOffsets[0]; } + const std::vector &getPreferredTextSegmentAddresses() const { + return PreferredTextSegmentAddresses; + } + const std::vector &getTextSegmentOffsets() const { + return TextSegmentOffsets; + } + + uint64_t getInstSize(uint64_t Address) const { + auto I = AddressToInstSizeMap.find(Address); + if (I == AddressToInstSizeMap.end()) + return 0; + return I->second; + } + + bool addressIsCode(uint64_t Address) const { + return AddressToInstSizeMap.find(Address) != AddressToInstSizeMap.end(); + } + + bool addressIsCall(uint64_t Address) const { + return CallAddressSet.count(Address); + } + bool addressIsReturn(uint64_t Address) const { + return RetAddressSet.count(Address); + } + bool addressInPrologEpilog(uint64_t Address) const { + return ProEpilogTracker.PrologEpilogSet.count(Address); + } + + bool addressIsTransfer(uint64_t Address) { + return BranchAddressSet.count(Address) || RetAddressSet.count(Address) || + CallAddressSet.count(Address); + } + + bool rangeCrossUncondBranch(uint64_t Start, uint64_t End) { + if (Start >= End) + return false; + auto R = UncondBranchAddrSet.lower_bound(Start); + return R != UncondBranchAddrSet.end() && *R < End; + } + + uint64_t getAddressforIndex(uint64_t Index) const { + return CodeAddressVec[Index]; + } + + size_t getCodeAddrVecSize() const { return CodeAddressVec.size(); } + + bool usePseudoProbes() const { return UsePseudoProbes; } + bool useFSDiscriminator() const { return UseFSDiscriminator; } + bool isKernel() const { return IsKernel; } + + static bool isKernelImageName(StringRef BinaryName) { + return BinaryName == "[kernel.kallsyms]" || + BinaryName == "[kernel.kallsyms]_stext" || + BinaryName == "[kernel.kallsyms]_text"; + } + + // Get the index in CodeAddressVec for the address + // As we might get an address which is not the code + // here it would round to the next valid code address by + // using lower bound operation + uint32_t getIndexForAddr(uint64_t Address) const { + auto Low = llvm::lower_bound(CodeAddressVec, Address); + return Low - CodeAddressVec.begin(); + } + + uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + if (FrameAddr == ExternalAddr) + return ExternalAddr; + auto I = getIndexForAddr(FrameAddr); + FrameAddr = I ? getAddressforIndex(I - 1) : 0; + if (FrameAddr && addressIsCall(FrameAddr)) + return FrameAddr; + return 0; + } + + FuncRange *findFuncRangeForStartAddr(uint64_t Address) { + auto I = StartAddrToFuncRangeMap.find(Address); + if (I == StartAddrToFuncRangeMap.end()) + return nullptr; + return &I->second; + } + + // Binary search the function range which includes the input address. + FuncRange *findFuncRange(uint64_t Address) { + auto I = StartAddrToFuncRangeMap.upper_bound(Address); + if (I == StartAddrToFuncRangeMap.begin()) + return nullptr; + I--; + + if (Address >= I->second.EndAddress) + return nullptr; + + return &I->second; + } + + // Get all ranges of one function. + RangesTy getRanges(uint64_t Address) { + auto *FRange = findFuncRange(Address); + // Ignore the range which falls into plt section or system lib. 
+ if (!FRange) + return RangesTy(); + + return FRange->Func->Ranges; + } + + const std::unordered_map & + getAllBinaryFunctions() { + return BinaryFunctions; + } + + std::unordered_set &getProfiledFunctions() { + return ProfiledFunctions; + } + + void setProfiledFunctions(std::unordered_set &Funcs) { + ProfiledFunctions = Funcs; + } + + BinaryFunction *getBinaryFunction(FunctionId FName) { + if (FName.isStringRef()) { + auto I = BinaryFunctions.find(FName.str()); + if (I == BinaryFunctions.end()) + return nullptr; + return &I->second; + } + auto I = HashBinaryFunctions.find(FName.getHashCode()); + if (I == HashBinaryFunctions.end()) + return nullptr; + return I->second; + } + + uint32_t getFuncSizeForContext(const ContextTrieNode *ContextNode) { + return FuncSizeTracker.getFuncSizeForContext(ContextNode); + } + + void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + + // Load the symbols from debug table and populate into symbol list. + void populateSymbolListFromDWARF(ProfileSymbolList &SymbolList); + + SampleContextFrameVector + getFrameLocationStack(uint64_t Address, bool UseProbeDiscriminator = false) { + InstructionPointer IP(this, Address); + return symbolize(IP, SymbolizerOpts.UseSymbolTable, UseProbeDiscriminator); + } + + const SampleContextFrameVector & + getCachedFrameLocationStack(uint64_t Address, + bool UseProbeDiscriminator = false) { + auto I = AddressToLocStackMap.emplace(Address, SampleContextFrameVector()); + if (I.second) { + I.first->second = getFrameLocationStack(Address, UseProbeDiscriminator); + } + return I.first->second; + } + + std::optional getInlineLeafFrameLoc(uint64_t Address) { + const auto &Stack = getCachedFrameLocationStack(Address); + if (Stack.empty()) + return {}; + return Stack.back(); + } + + void flushSymbolizer() { Symbolizer.reset(); } + + MissingFrameInferrer *getMissingContextInferrer() { + return MissingContextInferrer.get(); + } + + // Compare two addresses' inline context + bool inlineContextEqual(uint64_t Add1, uint64_t Add2); + + // Get the full context of the current stack with inline context filled in. + // It will search the disassembling info stored in AddressToLocStackMap. This + // is used as the key of function sample map + SampleContextFrameVector + getExpandedContext(const SmallVectorImpl &Stack, + bool &WasLeafInlined); + // Go through instructions among the given range and record its size for the + // inline context. + void computeInlinedContextSizeForRange(uint64_t StartAddress, + uint64_t EndAddress); + + void computeInlinedContextSizeForFunc(const BinaryFunction *Func); + + const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const { + return ProbeDecoder.getCallProbeForAddr(Address); + } + + void getInlineContextForProbe(const MCDecodedPseudoProbe *Probe, + SampleContextFrameVector &InlineContextStack, + bool IncludeLeaf = false) const { + SmallVector ProbeInlineContext; + ProbeDecoder.getInlineContextForProbe(Probe, ProbeInlineContext, + IncludeLeaf); + for (uint32_t I = 0; I < ProbeInlineContext.size(); I++) { + auto &Callsite = ProbeInlineContext[I]; + // Clear the current context for an unknown probe. 
+      if (Callsite.second == 0 && I != ProbeInlineContext.size() - 1) {
+        InlineContextStack.clear();
+        continue;
+      }
+      InlineContextStack.emplace_back(FunctionId(Callsite.first),
+                                      LineLocation(Callsite.second, 0));
+    }
+  }
+  const AddressProbesMap &getAddress2ProbesMap() const {
+    return ProbeDecoder.getAddress2ProbesMap();
+  }
+  const MCPseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) {
+    return ProbeDecoder.getFuncDescForGUID(GUID);
+  }
+
+  const MCPseudoProbeFuncDesc *
+  getInlinerDescForProbe(const MCDecodedPseudoProbe *Probe) {
+    return ProbeDecoder.getInlinerDescForProbe(Probe);
+  }
+
+  bool getTrackFuncContextSize() { return TrackFuncContextSize; }
+
+  bool getIsLoadedByMMap() { return IsLoadedByMMap; }
+
+  void setIsLoadedByMMap(bool Value) { IsLoadedByMMap = Value; }
+
+  bool getMissingMMapWarned() { return MissingMMapWarned; }
+
+  void setMissingMMapWarned(bool Value) { MissingMMapWarned = Value; }
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-19.1/llvm-profgen.cpp b/tools/ldc-profgen/ldc-profgen-19.1/llvm-profgen.cpp
new file mode 100644
index 00000000000..3b974e25103
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-19.1/llvm-profgen.cpp
@@ -0,0 +1,193 @@
+//===- llvm-profgen.cpp - LLVM SPGO profile generation tool -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-profgen generates SPGO profiles from perf script output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ErrorHandling.h"
+#include "PerfReader.h"
+#include "ProfileGenerator.h"
+#include "ProfiledBinary.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+static cl::OptionCategory ProfGenCategory("ProfGen Options");
+
+static cl::opt<std::string> PerfScriptFilename(
+    "perfscript", cl::value_desc("perfscript"),
+    cl::desc("Path of perf-script trace created by Linux perf tool with "
+             "`script` command (the raw perf.data should be profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PSA("ps", cl::desc("Alias for --perfscript"),
+                     cl::aliasopt(PerfScriptFilename));
+
+static cl::opt<std::string> PerfDataFilename(
+    "perfdata", cl::value_desc("perfdata"),
+    cl::desc("Path of raw perf data created by Linux perf tool (it should be "
+             "profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PDA("pd", cl::desc("Alias for --perfdata"),
+                     cl::aliasopt(PerfDataFilename));
+
+static cl::opt<std::string> UnsymbolizedProfFilename(
+    "unsymbolized-profile", cl::value_desc("unsymbolized profile"),
+    cl::desc("Path of the unsymbolized profile created by "
+             "`llvm-profgen` with `--skip-symbolization`"),
+    cl::cat(ProfGenCategory));
+static cl::alias UPA("up", cl::desc("Alias for --unsymbolized-profile"),
+                     cl::aliasopt(UnsymbolizedProfFilename));
+
+static cl::opt<std::string> SampleProfFilename(
+    "llvm-sample-profile", cl::value_desc("llvm sample profile"),
+    cl::desc("Path of the LLVM sample profile"), cl::cat(ProfGenCategory));
+
+static cl::opt<std::string>
+    BinaryPath("binary", cl::value_desc("binary"), cl::Required,
+               cl::desc("Path of profiled executable binary."),
+               cl::cat(ProfGenCategory));
+
+static cl::opt<uint32_t>
+    ProcessId("pid", cl::value_desc("process Id"), cl::init(0),
+              cl::desc("Process Id for the profiled executable binary."),
+              cl::cat(ProfGenCategory));
+
+static cl::opt<std::string> DebugBinPath(
+    "debug-binary", cl::value_desc("debug-binary"),
+    cl::desc("Path of debug info binary, llvm-profgen will load the DWARF info "
+             "from it instead of the executable binary."),
+    cl::cat(ProfGenCategory));
+
+extern cl::opt<bool> ShowDisassemblyOnly;
+extern cl::opt<bool> ShowSourceLocations;
+extern cl::opt<bool> SkipSymbolization;
+
+using namespace llvm;
+using namespace sampleprof;
+
+// Validate the command line input.
+static void validateCommandLine() {
+  // Allow the perf script to be missing if we only show the binary
+  // disassembly.
+  if (!ShowDisassemblyOnly) {
+    // Validate that the input profile is provided exactly once.
+    bool HasPerfData = PerfDataFilename.getNumOccurrences() > 0;
+    bool HasPerfScript = PerfScriptFilename.getNumOccurrences() > 0;
+    bool HasUnsymbolizedProfile =
+        UnsymbolizedProfFilename.getNumOccurrences() > 0;
+    bool HasSampleProfile = SampleProfFilename.getNumOccurrences() > 0;
+    uint16_t S =
+        HasPerfData + HasPerfScript + HasUnsymbolizedProfile + HasSampleProfile;
+    if (S != 1) {
+      std::string Msg =
+          S > 1
+              ? "`--perfscript`, `--perfdata` and `--unsymbolized-profile` "
+                "cannot be used together."
+              : "Perf input file is missing, please use one of `--perfscript`, "
+                "`--perfdata` and `--unsymbolized-profile` for the input.";
+      exitWithError(Msg);
+    }
+
+    auto CheckFileExists = [](bool H, StringRef File) {
+      if (H && !llvm::sys::fs::exists(File)) {
+        std::string Msg = "Input perf file(" + File.str() + ") doesn't exist.";
+        exitWithError(Msg);
+      }
+    };
+
+    CheckFileExists(HasPerfData, PerfDataFilename);
+    CheckFileExists(HasPerfScript, PerfScriptFilename);
+    CheckFileExists(HasUnsymbolizedProfile, UnsymbolizedProfFilename);
+    CheckFileExists(HasSampleProfile, SampleProfFilename);
+  }
+
+  if (!llvm::sys::fs::exists(BinaryPath)) {
+    std::string Msg = "Input binary(" + BinaryPath + ") doesn't exist.";
+    exitWithError(Msg);
+  }
+
+  if (CSProfileGenerator::MaxCompressionSize < -1) {
+    exitWithError("Value of --compress-recursion should be >= -1");
+  }
+  if (ShowSourceLocations && !ShowDisassemblyOnly) {
+    exitWithError("--show-source-locations should work together with "
+                  "--show-disassembly-only!");
+  }
+}
+
+static PerfInputFile getPerfInputFile() {
+  PerfInputFile File;
+  if (PerfDataFilename.getNumOccurrences()) {
+    File.InputFile = PerfDataFilename;
+    File.Format = PerfFormat::PerfData;
+  } else if (PerfScriptFilename.getNumOccurrences()) {
+    File.InputFile = PerfScriptFilename;
+    File.Format = PerfFormat::PerfScript;
+  } else if (UnsymbolizedProfFilename.getNumOccurrences()) {
+    File.InputFile = UnsymbolizedProfFilename;
+    File.Format = PerfFormat::UnsymbolizedProfile;
+  }
+  return File;
+}
+
+int main(int argc, const char *argv[]) {
+  InitLLVM X(argc, argv);
+
+  // Initialize targets and assembly printers/parsers.
+  InitializeAllTargetInfos();
+  InitializeAllTargetMCs();
+  InitializeAllDisassemblers();
+
+  cl::HideUnrelatedOptions({&ProfGenCategory, &getColorCategory()});
+  cl::ParseCommandLineOptions(argc, argv, "llvm SPGO profile generator\n");
+  validateCommandLine();
+
+  // Load symbols and disassemble the code of a given binary.
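+  // Note: the ProfiledBinary constructor already performs the disassembly,
+  // so with --show-disassembly-only there is nothing left to do below.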
+  std::unique_ptr<ProfiledBinary> Binary =
+      std::make_unique<ProfiledBinary>(BinaryPath, DebugBinPath);
+  if (ShowDisassemblyOnly)
+    return EXIT_SUCCESS;
+
+  if (SampleProfFilename.getNumOccurrences()) {
+    LLVMContext Context;
+    auto FS = vfs::getRealFileSystem();
+    auto ReaderOrErr =
+        SampleProfileReader::create(SampleProfFilename, Context, *FS);
+    std::unique_ptr<SampleProfileReader> Reader =
+        std::move(ReaderOrErr.get());
+    Reader->read();
+    std::unique_ptr<ProfileGeneratorBase> Generator =
+        ProfileGeneratorBase::create(Binary.get(), Reader->getProfiles(),
+                                     Reader->profileIsCS());
+    Generator->generateProfile();
+    Generator->write();
+  } else {
+    std::optional<uint32_t> PIDFilter;
+    if (ProcessId.getNumOccurrences())
+      PIDFilter = ProcessId;
+    PerfInputFile PerfFile = getPerfInputFile();
+    std::unique_ptr<PerfReaderBase> Reader =
+        PerfReaderBase::create(Binary.get(), PerfFile, PIDFilter);
+    // Parse perf events and samples.
+    Reader->parsePerfTraces();
+
+    if (SkipSymbolization)
+      return EXIT_SUCCESS;
+
+    std::unique_ptr<ProfileGeneratorBase> Generator =
+        ProfileGeneratorBase::create(Binary.get(), &Reader->getSampleCounters(),
+                                     Reader->profileIsCS());
+    Generator->generateProfile();
+    Generator->write();
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/utils/FileCheck-19.cpp b/utils/FileCheck-19.cpp
new file mode 100644
index 00000000000..9cf3a3164df
--- /dev/null
+++ b/utils/FileCheck-19.cpp
@@ -0,0 +1,879 @@
+//===- FileCheck.cpp - Check that File's Contents match what is expected --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// FileCheck does a line-by-line check of a file that validates whether it
+// contains the expected content. This is useful for regression tests etc.
+//
+// This program exits with an exit status of 2 on error, exit status of 0 if
+// the file matched the expected contents, and exit status of 1 if it did not
+// contain the expected contents.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/FileCheck/FileCheck.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cmath>
+#include <map>
+using namespace llvm;
+
+static cl::extrahelp FileCheckOptsEnv(
+    "\nOptions are parsed from the environment variable FILECHECK_OPTS and\n"
+    "from the command line.\n");
+
+static cl::opt<std::string>
+    CheckFilename(cl::Positional, cl::desc("<check-file>"), cl::Optional);
+
+static cl::opt<std::string>
+    InputFilename("input-file", cl::desc("File to check (defaults to stdin)"),
+                  cl::init("-"), cl::value_desc("filename"));
+
+static cl::list<std::string> CheckPrefixes(
+    "check-prefix",
+    cl::desc("Prefix to use from check file (defaults to 'CHECK')"));
+static cl::alias CheckPrefixesAlias(
+    "check-prefixes", cl::aliasopt(CheckPrefixes), cl::CommaSeparated,
+    cl::NotHidden,
+    cl::desc(
+        "Alias for -check-prefix permitting multiple comma separated values"));
+
+static cl::list<std::string> CommentPrefixes(
+    "comment-prefixes", cl::CommaSeparated, cl::Hidden,
+    cl::desc("Comma-separated list of comment prefixes to use from check file\n"
+             "(defaults to 'COM,RUN'). Please avoid using this feature in\n"
+             "LLVM's LIT-based test suites, which should be easier to\n"
+             "maintain if they all follow a consistent comment style. This\n"
+             "feature is meant for non-LIT test suites using FileCheck."));
+
+static cl::opt<bool> NoCanonicalizeWhiteSpace(
+    "strict-whitespace",
+    cl::desc("Do not treat all horizontal whitespace as equivalent"));
+
+static cl::opt<bool> IgnoreCase(
+    "ignore-case",
+    cl::desc("Use case-insensitive matching"));
+
+static cl::list<std::string> ImplicitCheckNot(
+    "implicit-check-not",
+    cl::desc("Add an implicit negative check with this pattern to every\n"
+             "positive check. This can be used to ensure that no instances of\n"
+             "this pattern occur which are not matched by a positive pattern"),
+    cl::value_desc("pattern"));
+
+static cl::list<std::string>
+    GlobalDefines("D", cl::AlwaysPrefix,
+                  cl::desc("Define a variable to be used in capture patterns."),
+                  cl::value_desc("VAR=VALUE"));
+
+static cl::opt<bool> AllowEmptyInput(
+    "allow-empty", cl::init(false),
+    cl::desc("Allow the input file to be empty. This is useful when making\n"
+             "checks that some error message does not occur, for example."));
+
+static cl::opt<bool> AllowUnusedPrefixes(
+    "allow-unused-prefixes",
+    cl::desc("Allow prefixes to be specified but not appear in the test."));
+
+static cl::opt<bool> MatchFullLines(
+    "match-full-lines", cl::init(false),
+    cl::desc("Require all positive matches to cover an entire input line.\n"
+             "Allows leading and trailing whitespace if --strict-whitespace\n"
+             "is not also passed."));
+
+static cl::opt<bool> EnableVarScope(
+    "enable-var-scope", cl::init(false),
+    cl::desc("Enables scope for regex variables. Variables with names that\n"
+             "do not start with '$' will be reset at the beginning of\n"
+             "each CHECK-LABEL block."));
+
+static cl::opt<bool> AllowDeprecatedDagOverlap(
+    "allow-deprecated-dag-overlap", cl::init(false),
+    cl::desc("Enable overlapping among matches in a group of consecutive\n"
+             "CHECK-DAG directives. This option is deprecated and is only\n"
+             "provided for convenience as old tests are migrated to the new\n"
+             "non-overlapping CHECK-DAG implementation.\n"));
+
+static cl::opt<bool> Verbose(
+    "v",
+    cl::desc("Print directive pattern matches, or add them to the input dump\n"
+             "if enabled.\n"));
+
+static cl::opt<bool> VerboseVerbose(
+    "vv",
+    cl::desc("Print information helpful in diagnosing internal FileCheck\n"
+             "issues, or add it to the input dump if enabled. Implies\n"
+             "-v.\n"));
+
+// The order of DumpInputValue members affects their precedence, as documented
+// for -dump-input below.
+enum DumpInputValue {
+  DumpInputNever,
+  DumpInputFail,
+  DumpInputAlways,
+  DumpInputHelp
+};
+
+static cl::list<DumpInputValue> DumpInputs(
+    "dump-input",
+    cl::desc("Dump input to stderr, adding annotations representing\n"
+             "currently enabled diagnostics. When there are multiple\n"
+             "occurrences of this option, the <value> that appears earliest\n"
+             "in the list below has precedence. The default is 'fail'.\n"),
+    cl::value_desc("mode"),
+    cl::values(clEnumValN(DumpInputHelp, "help", "Explain input dump and quit"),
+               clEnumValN(DumpInputAlways, "always", "Always dump input"),
+               clEnumValN(DumpInputFail, "fail", "Dump input on failure"),
+               clEnumValN(DumpInputNever, "never", "Never dump input")));
+
+// The order of DumpInputFilterValue members affects their precedence, as
+// documented for -dump-input-filter below.
+enum DumpInputFilterValue {
+  DumpInputFilterError,
+  DumpInputFilterAnnotation,
+  DumpInputFilterAnnotationFull,
+  DumpInputFilterAll
+};
+
+static cl::list<DumpInputFilterValue> DumpInputFilters(
+    "dump-input-filter",
+    cl::desc("In the dump requested by -dump-input, print only input lines of\n"
+             "kind <value> plus any context specified by -dump-input-context.\n"
+             "When there are multiple occurrences of this option, the <value>\n"
+             "that appears earliest in the list below has precedence. The\n"
+             "default is 'error' when -dump-input=fail, and it's 'all' when\n"
+             "-dump-input=always.\n"),
+    cl::values(clEnumValN(DumpInputFilterAll, "all", "All input lines"),
+               clEnumValN(DumpInputFilterAnnotationFull, "annotation-full",
+                          "Input lines with annotations"),
+               clEnumValN(DumpInputFilterAnnotation, "annotation",
+                          "Input lines with starting points of annotations"),
+               clEnumValN(DumpInputFilterError, "error",
+                          "Input lines with starting points of error "
+                          "annotations")));
+
+static cl::list<unsigned> DumpInputContexts(
+    "dump-input-context", cl::value_desc("N"),
+    cl::desc("In the dump requested by -dump-input, print <N> input lines\n"
+             "before and <N> input lines after any lines specified by\n"
+             "-dump-input-filter. When there are multiple occurrences of\n"
+             "this option, the largest specified <N> has precedence. The\n"
+             "default is 5.\n"));
+
+typedef cl::list<std::string>::const_iterator prefix_iterator;
+
+static void DumpCommandLine(int argc, char **argv) {
+  errs() << "FileCheck command line: ";
+  for (int I = 0; I < argc; I++)
+    errs() << " " << argv[I];
+  errs() << "\n";
+}
+
+struct MarkerStyle {
+  /// The starting char (before tildes) for marking the line.
+  char Lead;
+  /// What color to use for this annotation.
+  raw_ostream::Colors Color;
+  /// A note to follow the marker, or empty string if none.
+  std::string Note;
+  /// Does this marker indicate inclusion by -dump-input-filter=error?
+  bool FiltersAsError;
+  MarkerStyle() {}
+  MarkerStyle(char Lead, raw_ostream::Colors Color,
+              const std::string &Note = "", bool FiltersAsError = false)
+      : Lead(Lead), Color(Color), Note(Note), FiltersAsError(FiltersAsError) {
+    assert((!FiltersAsError || !Note.empty()) &&
+           "expected error diagnostic to have note");
+  }
+};
+
+static MarkerStyle GetMarker(FileCheckDiag::MatchType MatchTy) {
+  switch (MatchTy) {
+  case FileCheckDiag::MatchFoundAndExpected:
+    return MarkerStyle('^', raw_ostream::GREEN);
+  case FileCheckDiag::MatchFoundButExcluded:
+    return MarkerStyle('!', raw_ostream::RED, "error: no match expected",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchFoundButWrongLine:
+    return MarkerStyle('!', raw_ostream::RED, "error: match on wrong line",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchFoundButDiscarded:
+    return MarkerStyle('!', raw_ostream::CYAN,
+                       "discard: overlaps earlier match");
+  case FileCheckDiag::MatchFoundErrorNote:
+    // Note should always be overridden within the FileCheckDiag.
+    return MarkerStyle('!', raw_ostream::RED,
+                       "error: unknown error after match",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchNoneAndExcluded:
+    return MarkerStyle('X', raw_ostream::GREEN);
+  case FileCheckDiag::MatchNoneButExpected:
+    return MarkerStyle('X', raw_ostream::RED, "error: no match found",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchNoneForInvalidPattern:
+    return MarkerStyle('X', raw_ostream::RED,
+                       "error: match failed for invalid pattern",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchFuzzy:
+    return MarkerStyle('?', raw_ostream::MAGENTA, "possible intended match",
+                       /*FiltersAsError=*/true);
+  }
+  llvm_unreachable_internal("unexpected match type");
+}
+
+static void DumpInputAnnotationHelp(raw_ostream &OS) {
+  OS << "The following description was requested by -dump-input=help to\n"
+     << "explain the input dump printed by FileCheck.\n"
+     << "\n"
+     << "Related command-line options:\n"
+     << "\n"
+     << "  - -dump-input=<value> enables or disables the input dump\n"
+     << "  - -dump-input-filter=<value> filters the input lines\n"
+     << "  - -dump-input-context=<N> adjusts the context of filtered lines\n"
+     << "  - -v and -vv add more annotations\n"
+     << "  - -color forces colors to be enabled both in the dump and below\n"
+     << "  - -help documents the above options in more detail\n"
+     << "\n"
+     << "These options can also be set via FILECHECK_OPTS. For example, for\n"
+     << "maximum debugging output on failures:\n"
+     << "\n"
+     << "  $ FILECHECK_OPTS='-dump-input-filter=all -vv -color' ninja check\n"
+     << "\n"
+     << "Input dump annotation format:\n"
+     << "\n";
+
+  // Labels for input lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "L:";
+  OS << "     labels line number L of the input file\n"
+     << "           An extra space is added after each input line to represent"
+     << " the\n"
+     << "           newline character\n";
+
+  // Labels for annotation lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "T:L";
+  OS << "    labels the only match result for either (1) a pattern of type T"
+     << " from\n"
+     << "           line L of the check file if L is an integer or (2) the"
+     << " I-th implicit\n"
+     << "           pattern if L is \"imp\" followed by an integer "
+     << "I (index origin one)\n";
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "T:L'N";
+  OS << "  labels the Nth match result for such a pattern\n";
+
+  // Markers on annotation lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "^~~";
+  OS << "    marks good match (reported if -v)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "!~~";
+  OS << "    marks bad match, such as:\n"
+     << "           - CHECK-NEXT on same line as previous match (error)\n"
+     << "           - CHECK-NOT found (error)\n"
+     << "           - CHECK-DAG overlapping match (discarded, reported if "
+     << "-vv)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "X~~";
+  OS << "    marks search range when no match is found, such as:\n"
+     << "           - CHECK-NEXT not found (error)\n"
+     << "           - CHECK-NOT not found (success, reported if -vv)\n"
+     << "           - CHECK-DAG not found after discarded matches (error)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "?";
+  OS << "      marks fuzzy match when no match is found\n";
+
+  // Elided lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "...";
+  OS << "    indicates elided input lines and annotations, as specified by\n"
+     << "           -dump-input-filter and -dump-input-context\n";
+
+  // Colors.
+  OS << "  - colors ";
+  WithColor(OS, raw_ostream::GREEN, true) << "success";
+  OS << ", ";
+  WithColor(OS, raw_ostream::RED, true) << "error";
+  OS << ", ";
+  WithColor(OS, raw_ostream::MAGENTA, true) << "fuzzy match";
+  OS << ", ";
+  WithColor(OS, raw_ostream::CYAN, true, false) << "discarded match";
+  OS << ", ";
+  WithColor(OS, raw_ostream::CYAN, true, true) << "unmatched input";
+  OS << "\n";
+}
+
+/// An annotation for a single input line.
+struct InputAnnotation {
+  /// The index of the match result across all checks.
+  unsigned DiagIndex;
+  /// The label for this annotation.
+  std::string Label;
+  /// Is this the initial fragment of a diagnostic that has been broken across
+  /// multiple lines?
+  bool IsFirstLine;
+  /// What input line (one-origin indexing) this annotation marks. This might
+  /// be different from the starting line of the original diagnostic if
+  /// !IsFirstLine.
+  unsigned InputLine;
+  /// The column range (one-origin indexing, open end) in which to mark the
+  /// input line. If InputEndCol is UINT_MAX, treat it as the last column
+  /// before the newline.
+  unsigned InputStartCol, InputEndCol;
+  /// The marker to use.
+  MarkerStyle Marker;
+  /// Whether this annotation represents a good match for an expected pattern.
+  bool FoundAndExpectedMatch;
+};
+
+/// Get an abbreviation for the check type.
+static std::string GetCheckTypeAbbreviation(Check::FileCheckType Ty) {
+  switch (Ty) {
+  case Check::CheckPlain:
+    if (Ty.getCount() > 1)
+      return "count";
+    return "check";
+  case Check::CheckNext:
+    return "next";
+  case Check::CheckSame:
+    return "same";
+  case Check::CheckNot:
+    return "not";
+  case Check::CheckDAG:
+    return "dag";
+  case Check::CheckLabel:
+    return "label";
+  case Check::CheckEmpty:
+    return "empty";
+  case Check::CheckComment:
+    return "com";
+  case Check::CheckEOF:
+    return "eof";
+  case Check::CheckBadNot:
+    return "bad-not";
+  case Check::CheckBadCount:
+    return "bad-count";
+  case Check::CheckMisspelled:
+    return "misspelled";
+  case Check::CheckNone:
+    llvm_unreachable("invalid FileCheckType");
+  }
+  llvm_unreachable("unknown FileCheckType");
+}
+
+static void
+BuildInputAnnotations(const SourceMgr &SM, unsigned CheckFileBufferID,
+                      const std::pair<unsigned, unsigned> &ImpPatBufferIDRange,
+                      const std::vector<FileCheckDiag> &Diags,
+                      std::vector<InputAnnotation> &Annotations,
+                      unsigned &LabelWidth) {
+  struct CompareSMLoc {
+    bool operator()(const SMLoc &LHS, const SMLoc &RHS) const {
+      return LHS.getPointer() < RHS.getPointer();
+    }
+  };
+  // How many diagnostics does each pattern have?
+  std::map<SMLoc, unsigned, CompareSMLoc> DiagCountPerPattern;
+  for (const FileCheckDiag &Diag : Diags)
+    ++DiagCountPerPattern[Diag.CheckLoc];
+  // How many diagnostics have we seen so far per pattern?
+  std::map<SMLoc, unsigned, CompareSMLoc> DiagIndexPerPattern;
+  // How many total diagnostics have we seen so far?
+  unsigned DiagIndex = 0;
+  // What's the widest label?
+  LabelWidth = 0;
+  for (auto DiagItr = Diags.begin(), DiagEnd = Diags.end(); DiagItr != DiagEnd;
+       ++DiagItr) {
+    InputAnnotation A;
+    A.DiagIndex = DiagIndex++;
+
+    // Build the label, which uniquely identifies this check result.
+    unsigned CheckBufferID = SM.FindBufferContainingLoc(DiagItr->CheckLoc);
+    auto CheckLineAndCol =
+        SM.getLineAndColumn(DiagItr->CheckLoc, CheckBufferID);
+    llvm::raw_string_ostream Label(A.Label);
+    Label << GetCheckTypeAbbreviation(DiagItr->CheckTy) << ":";
+    if (CheckBufferID == CheckFileBufferID)
+      Label << CheckLineAndCol.first;
+    else if (ImpPatBufferIDRange.first <= CheckBufferID &&
+             CheckBufferID < ImpPatBufferIDRange.second)
+      Label << "imp" << (CheckBufferID - ImpPatBufferIDRange.first + 1);
+    else
+      llvm_unreachable("expected diagnostic's check location to be either in "
+                       "the check file or for an implicit pattern");
+    if (DiagCountPerPattern[DiagItr->CheckLoc] > 1)
+      Label << "'" << DiagIndexPerPattern[DiagItr->CheckLoc]++;
+    LabelWidth = std::max((std::string::size_type)LabelWidth, A.Label.size());
+
+    A.Marker = GetMarker(DiagItr->MatchTy);
+    if (!DiagItr->Note.empty()) {
+      A.Marker.Note = DiagItr->Note;
+      // It's less confusing if notes that don't actually have ranges don't
+      // have markers. For example, a marker for 'with "VAR" equal to "5"'
+      // would seem to indicate where "VAR" matches, but the location we
+      // actually have for the marker simply points to the start of the
+      // match/search range for the full pattern of which the substitution is
+      // potentially just one component.
+      if (DiagItr->InputStartLine == DiagItr->InputEndLine &&
+          DiagItr->InputStartCol == DiagItr->InputEndCol)
+        A.Marker.Lead = ' ';
+    }
+    if (DiagItr->MatchTy == FileCheckDiag::MatchFoundErrorNote) {
+      assert(!DiagItr->Note.empty() &&
+             "expected custom note for MatchFoundErrorNote");
+      A.Marker.Note = "error: " + A.Marker.Note;
+    }
+    A.FoundAndExpectedMatch =
+        DiagItr->MatchTy == FileCheckDiag::MatchFoundAndExpected;
+
+    // Compute the mark location, and break the annotation into multiple
+    // annotations if it spans multiple lines.
+    A.IsFirstLine = true;
+    A.InputLine = DiagItr->InputStartLine;
+    A.InputStartCol = DiagItr->InputStartCol;
+    if (DiagItr->InputStartLine == DiagItr->InputEndLine) {
+      // Sometimes ranges are empty in order to indicate a specific point, but
+      // that would mean nothing would be marked, so adjust the range to
+      // include the following character.
+      A.InputEndCol =
+          std::max(DiagItr->InputStartCol + 1, DiagItr->InputEndCol);
+      Annotations.push_back(A);
+    } else {
+      assert(DiagItr->InputStartLine < DiagItr->InputEndLine &&
+             "expected input range not to be inverted");
+      A.InputEndCol = UINT_MAX;
+      Annotations.push_back(A);
+      for (unsigned L = DiagItr->InputStartLine + 1, E = DiagItr->InputEndLine;
+           L <= E; ++L) {
+        // If a range ends before the first column on a line, then it has no
+        // characters on that line, so there's nothing to render.
+        if (DiagItr->InputEndCol == 1 && L == E)
+          break;
+        InputAnnotation B;
+        B.DiagIndex = A.DiagIndex;
+        B.Label = A.Label;
+        B.IsFirstLine = false;
+        B.InputLine = L;
+        B.Marker = A.Marker;
+        B.Marker.Lead = '~';
+        B.Marker.Note = "";
+        B.InputStartCol = 1;
+        if (L != E)
+          B.InputEndCol = UINT_MAX;
+        else
+          B.InputEndCol = DiagItr->InputEndCol;
+        B.FoundAndExpectedMatch = A.FoundAndExpectedMatch;
+        Annotations.push_back(B);
+      }
+    }
+  }
+}
+
+static unsigned FindInputLineInFilter(
+    DumpInputFilterValue DumpInputFilter, unsigned CurInputLine,
+    const std::vector<InputAnnotation>::iterator &AnnotationBeg,
+    const std::vector<InputAnnotation>::iterator &AnnotationEnd) {
+  if (DumpInputFilter == DumpInputFilterAll)
+    return CurInputLine;
+  for (auto AnnotationItr = AnnotationBeg; AnnotationItr != AnnotationEnd;
+       ++AnnotationItr) {
+    switch (DumpInputFilter) {
+    case DumpInputFilterAll:
+      llvm_unreachable("unexpected DumpInputFilterAll");
+      break;
+    case DumpInputFilterAnnotationFull:
+      return AnnotationItr->InputLine;
+    case DumpInputFilterAnnotation:
+      if (AnnotationItr->IsFirstLine)
+        return AnnotationItr->InputLine;
+      break;
+    case DumpInputFilterError:
+      if (AnnotationItr->IsFirstLine && AnnotationItr->Marker.FiltersAsError)
+        return AnnotationItr->InputLine;
+      break;
+    }
+  }
+  return UINT_MAX;
+}
+
+/// To OS, print a vertical ellipsis (right-justified at LabelWidth) if it
+/// would occupy fewer lines than ElidedLines, but print ElidedLines otherwise.
+/// Either way, clear ElidedLines. Thus, if ElidedLines is empty, do nothing.
+static void DumpEllipsisOrElidedLines(raw_ostream &OS, std::string &ElidedLines,
+                                      unsigned LabelWidth) {
+  if (ElidedLines.empty())
+    return;
+  unsigned EllipsisLines = 3;
+  if (EllipsisLines < StringRef(ElidedLines).count('\n')) {
+    for (unsigned i = 0; i < EllipsisLines; ++i) {
+      WithColor(OS, raw_ostream::BLACK, /*Bold=*/true)
+          << right_justify(".", LabelWidth);
+      OS << '\n';
+    }
+  } else
+    OS << ElidedLines;
+  ElidedLines.clear();
+}
+
+static void DumpAnnotatedInput(raw_ostream &OS, const FileCheckRequest &Req,
+                               DumpInputFilterValue DumpInputFilter,
+                               unsigned DumpInputContext,
+                               StringRef InputFileText,
+                               std::vector<InputAnnotation> &Annotations,
+                               unsigned LabelWidth) {
+  OS << "Input was:\n<<<<<<\n";
+
+  // Sort annotations.
+  llvm::sort(Annotations,
+             [](const InputAnnotation &A, const InputAnnotation &B) {
+               // 1. Sort annotations in the order of the input lines.
+               //
+               // This makes it easier to find relevant annotations while
+               // iterating input lines in the implementation below. FileCheck
+               // does not always produce diagnostics in the order of input
+               // lines due to, for example, CHECK-DAG and CHECK-NOT.
+               if (A.InputLine != B.InputLine)
+                 return A.InputLine < B.InputLine;
+               // 2. Sort annotations in the temporal order FileCheck produced
+               // their associated diagnostics.
+               //
+               // This sort offers several benefits:
+               //
+               // A. On a single input line, the order of annotations reflects
+               //    the FileCheck logic for processing directives/patterns.
+               //    This can be helpful in understanding cases in which the
+               //    order of the associated directives/patterns in the check
+               //    file or on the command line either (i) does not match the
+               //    temporal order in which FileCheck looks for matches for
+               //    the directives/patterns (due to, for example, CHECK-LABEL,
+               //    CHECK-NOT, or `--implicit-check-not`) or (ii) does match
+               //    that order but does not match the order of those
+               //    diagnostics along an input line (due to, for example,
+               //    CHECK-DAG).
+               //
+               // On the other hand, because our presentation format presents
+               // input lines in order, there's no clear way to offer the
+               // same benefit across input lines. For consistency, it might
+               // then seem worthwhile to have annotations on a single line
+               // also sorted in input order (that is, by input column).
+               // However, in practice, this appears to be more confusing
+               // than helpful. Perhaps it's intuitive to expect annotations
+               // to be listed in the temporal order in which they were
+               // produced except in cases the presentation format obviously
+               // and inherently cannot support it (that is, across input
+               // lines).
+               //
+               // B. When diagnostics' annotations are split among multiple
+               //    input lines, the user must track them from one input line
+               //    to the next. One property of the sort chosen here is that
+               //    it facilitates the user in this regard by ensuring the
+               //    following: when comparing any two input lines, a
+               //    diagnostic's annotations are sorted in the same position
+               //    relative to all other diagnostics' annotations.
+               return A.DiagIndex < B.DiagIndex;
+             });
+
+  // Compute the width of the label column.
+  const unsigned char *InputFilePtr = InputFileText.bytes_begin(),
+                      *InputFileEnd = InputFileText.bytes_end();
+  unsigned LineCount = InputFileText.count('\n');
+  if (InputFileEnd[-1] != '\n')
+    ++LineCount;
+  unsigned LineNoWidth = std::log10(LineCount) + 1;
+  // +3 below adds spaces (1) to the left of the (right-aligned) line numbers
+  // on input lines and (2) to the right of the (left-aligned) labels on
+  // annotation lines so that input lines and annotation lines are more
+  // visually distinct. For example, the spaces on the annotation lines ensure
+  // that input line numbers and check directive line numbers never align
+  // horizontally. Those line numbers might not even be for the same file.
+  // One space would be enough to achieve that, but more makes it even easier
+  // to see.
+  LabelWidth = std::max(LabelWidth, LineNoWidth) + 3;
+
+  // Print annotated input lines.
+  unsigned PrevLineInFilter = 0; // 0 means none so far
+  unsigned NextLineInFilter = 0; // 0 means uncomputed, UINT_MAX means none
+  std::string ElidedLines;
+  raw_string_ostream ElidedLinesOS(ElidedLines);
+  ColorMode TheColorMode =
+      WithColor(OS).colorsEnabled() ? ColorMode::Enable : ColorMode::Disable;
+  if (TheColorMode == ColorMode::Enable)
+    ElidedLinesOS.enable_colors(true);
+  auto AnnotationItr = Annotations.begin(), AnnotationEnd = Annotations.end();
+  for (unsigned Line = 1;
+       InputFilePtr != InputFileEnd || AnnotationItr != AnnotationEnd;
+       ++Line) {
+    const unsigned char *InputFileLine = InputFilePtr;
+
+    // Compute the previous and next line included by the filter.
+    if (NextLineInFilter < Line)
+      NextLineInFilter = FindInputLineInFilter(DumpInputFilter, Line,
+                                               AnnotationItr, AnnotationEnd);
+    assert(NextLineInFilter && "expected NextLineInFilter to be computed");
+    if (NextLineInFilter == Line)
+      PrevLineInFilter = Line;
+
+    // Elide this input line and its annotations if it's not within the
+    // context specified by -dump-input-context of an input line included by
+    // -dump-input-filter. However, in case the resulting ellipsis would
+    // occupy more lines than the input lines and annotations it elides,
+    // buffer the elided lines and annotations so we can print them instead.
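+    // LineOS points at either the real output stream or the buffer of elided
+    // lines, depending on whether the current line is elided.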
+    raw_ostream *LineOS;
+    if ((!PrevLineInFilter || PrevLineInFilter + DumpInputContext < Line) &&
+        (NextLineInFilter == UINT_MAX ||
+         Line + DumpInputContext < NextLineInFilter))
+      LineOS = &ElidedLinesOS;
+    else {
+      LineOS = &OS;
+      DumpEllipsisOrElidedLines(OS, ElidedLines, LabelWidth);
+    }
+
+    // Print right-aligned line number.
+    WithColor(*LineOS, raw_ostream::BLACK, /*Bold=*/true, /*BG=*/false,
+              TheColorMode)
+        << format_decimal(Line, LabelWidth) << ": ";
+
+    // For the case where -v and colors are enabled, find the annotations for
+    // good matches for expected patterns in order to highlight everything
+    // else in the line. There are no such annotations if -v is disabled.
+    std::vector<InputAnnotation> FoundAndExpectedMatches;
+    if (Req.Verbose && TheColorMode == ColorMode::Enable) {
+      for (auto I = AnnotationItr; I != AnnotationEnd && I->InputLine == Line;
+           ++I) {
+        if (I->FoundAndExpectedMatch)
+          FoundAndExpectedMatches.push_back(*I);
+      }
+    }
+
+    // Print numbered line with highlighting where there are no matches for
+    // expected patterns.
+    bool Newline = false;
+    {
+      WithColor COS(*LineOS, raw_ostream::SAVEDCOLOR, /*Bold=*/false,
+                    /*BG=*/false, TheColorMode);
+      bool InMatch = false;
+      if (Req.Verbose)
+        COS.changeColor(raw_ostream::CYAN, true, true);
+      for (unsigned Col = 1; InputFilePtr != InputFileEnd && !Newline; ++Col) {
+        bool WasInMatch = InMatch;
+        InMatch = false;
+        for (const InputAnnotation &M : FoundAndExpectedMatches) {
+          if (M.InputStartCol <= Col && Col < M.InputEndCol) {
+            InMatch = true;
+            break;
+          }
+        }
+        if (!WasInMatch && InMatch)
+          COS.resetColor();
+        else if (WasInMatch && !InMatch)
+          COS.changeColor(raw_ostream::CYAN, true, true);
+        if (*InputFilePtr == '\n') {
+          Newline = true;
+          COS << ' ';
+        } else
+          COS << *InputFilePtr;
+        ++InputFilePtr;
+      }
+    }
+    *LineOS << '\n';
+    unsigned InputLineWidth = InputFilePtr - InputFileLine;
+
+    // Print any annotations.
+    while (AnnotationItr != AnnotationEnd &&
+           AnnotationItr->InputLine == Line) {
+      WithColor COS(*LineOS, AnnotationItr->Marker.Color, /*Bold=*/true,
+                    /*BG=*/false, TheColorMode);
+      // The two spaces below are where the ": " appears on input lines.
+      COS << left_justify(AnnotationItr->Label, LabelWidth) << "  ";
+      unsigned Col;
+      for (Col = 1; Col < AnnotationItr->InputStartCol; ++Col)
+        COS << ' ';
+      COS << AnnotationItr->Marker.Lead;
+      // If InputEndCol=UINT_MAX, stop at InputLineWidth.
+      for (++Col; Col < AnnotationItr->InputEndCol && Col <= InputLineWidth;
+           ++Col)
+        COS << '~';
+      const std::string &Note = AnnotationItr->Marker.Note;
+      if (!Note.empty()) {
+        // Put the note at the end of the input line. If we were to instead
+        // put the note right after the marker, subsequent annotations for the
+        // same input line might appear to mark this note instead of the input
+        // line.
+        for (; Col <= InputLineWidth; ++Col)
+          COS << ' ';
+        COS << ' ' << Note;
+      }
+      COS << '\n';
+      ++AnnotationItr;
+    }
+  }
+  DumpEllipsisOrElidedLines(OS, ElidedLines, LabelWidth);
+
+  OS << ">>>>>>\n";
+}
+
+int main(int argc, char **argv) {
+  // Enable use of ANSI color codes because FileCheck is using them to
+  // highlight text.
+  llvm::sys::Process::UseANSIEscapeCodes(true);
+
+  InitLLVM X(argc, argv);
+  cl::ParseCommandLineOptions(argc, argv, /*Overview*/ "", /*Errs*/ nullptr,
+                              "FILECHECK_OPTS");
+
+  // Select -dump-input* values. The -help documentation specifies the default
+  // value and which value to choose if an option is specified multiple times.
+  // In the latter case, the general rule of thumb is to choose the value that
+  // provides the most information.
+  DumpInputValue DumpInput =
+      DumpInputs.empty() ? DumpInputFail : *llvm::max_element(DumpInputs);
+  DumpInputFilterValue DumpInputFilter;
+  if (DumpInputFilters.empty())
+    DumpInputFilter = DumpInput == DumpInputAlways ? DumpInputFilterAll
+                                                   : DumpInputFilterError;
+  else
+    DumpInputFilter = *llvm::max_element(DumpInputFilters);
+  unsigned DumpInputContext =
+      DumpInputContexts.empty() ? 5 : *llvm::max_element(DumpInputContexts);
+
+  if (DumpInput == DumpInputHelp) {
+    DumpInputAnnotationHelp(outs());
+    return 0;
+  }
+  if (CheckFilename.empty()) {
+    errs() << "<check-file> not specified\n";
+    return 2;
+  }
+
+  FileCheckRequest Req;
+  append_range(Req.CheckPrefixes, CheckPrefixes);
+
+  append_range(Req.CommentPrefixes, CommentPrefixes);
+
+  append_range(Req.ImplicitCheckNot, ImplicitCheckNot);
+
+  bool GlobalDefineError = false;
+  for (StringRef G : GlobalDefines) {
+    size_t EqIdx = G.find('=');
+    if (EqIdx == std::string::npos) {
+      errs() << "Missing equal sign in command-line definition '-D" << G
+             << "'\n";
+      GlobalDefineError = true;
+      continue;
+    }
+    if (EqIdx == 0) {
+      errs() << "Missing variable name in command-line definition '-D" << G
+             << "'\n";
+      GlobalDefineError = true;
+      continue;
+    }
+    Req.GlobalDefines.push_back(G);
+  }
+  if (GlobalDefineError)
+    return 2;
+
+  Req.AllowEmptyInput = AllowEmptyInput;
+  Req.AllowUnusedPrefixes = AllowUnusedPrefixes;
+  Req.EnableVarScope = EnableVarScope;
+  Req.AllowDeprecatedDagOverlap = AllowDeprecatedDagOverlap;
+  Req.Verbose = Verbose;
+  Req.VerboseVerbose = VerboseVerbose;
+  Req.NoCanonicalizeWhiteSpace = NoCanonicalizeWhiteSpace;
+  Req.MatchFullLines = MatchFullLines;
+  Req.IgnoreCase = IgnoreCase;
+
+  if (VerboseVerbose)
+    Req.Verbose = true;
+
+  FileCheck FC(Req);
+  if (!FC.ValidateCheckPrefixes())
+    return 2;
+
+  SourceMgr SM;
+
+  // Read the expected strings from the check file.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> CheckFileOrErr =
+      MemoryBuffer::getFileOrSTDIN(CheckFilename, /*IsText=*/true);
+  if (std::error_code EC = CheckFileOrErr.getError()) {
+    errs() << "Could not open check file '" << CheckFilename
+           << "': " << EC.message() << '\n';
+    return 2;
+  }
+  MemoryBuffer &CheckFile = *CheckFileOrErr.get();
+
+  SmallString<4096> CheckFileBuffer;
+  StringRef CheckFileText = FC.CanonicalizeFile(CheckFile, CheckFileBuffer);
+
+  unsigned CheckFileBufferID =
+      SM.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(
+                                CheckFileText, CheckFile.getBufferIdentifier()),
+                            SMLoc());
+
+  std::pair<unsigned, unsigned> ImpPatBufferIDRange;
+  if (FC.readCheckFile(SM, CheckFileText, &ImpPatBufferIDRange))
+    return 2;
+
+  // Open the file to check and add it to SourceMgr.
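+  // (It is canonicalized below the same way as the check file before
+  // matching.)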
+  ErrorOr<std::unique_ptr<MemoryBuffer>> InputFileOrErr =
+      MemoryBuffer::getFileOrSTDIN(InputFilename, /*IsText=*/true);
+  if (InputFilename == "-")
+    InputFilename = "<stdin>"; // Overwrite for improved diagnostic messages
+  if (std::error_code EC = InputFileOrErr.getError()) {
+    errs() << "Could not open input file '" << InputFilename
+           << "': " << EC.message() << '\n';
+    return 2;
+  }
+  MemoryBuffer &InputFile = *InputFileOrErr.get();
+
+  if (InputFile.getBufferSize() == 0 && !AllowEmptyInput) {
+    errs() << "FileCheck error: '" << InputFilename << "' is empty.\n";
+    DumpCommandLine(argc, argv);
+    return 2;
+  }
+
+  SmallString<4096> InputFileBuffer;
+  StringRef InputFileText = FC.CanonicalizeFile(InputFile, InputFileBuffer);
+
+  SM.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(
+                            InputFileText, InputFile.getBufferIdentifier()),
+                        SMLoc());
+
+  std::vector<FileCheckDiag> Diags;
+  int ExitCode = FC.checkInput(SM, InputFileText,
+                               DumpInput == DumpInputNever ? nullptr : &Diags)
+                     ? EXIT_SUCCESS
+                     : 1;
+  if (DumpInput == DumpInputAlways ||
+      (ExitCode == 1 && DumpInput == DumpInputFail)) {
+    errs() << "\n"
+           << "Input file: " << InputFilename << "\n"
+           << "Check file: " << CheckFilename << "\n"
+           << "\n"
+           << "-dump-input=help explains the following input dump.\n"
+           << "\n";
+    std::vector<InputAnnotation> Annotations;
+    unsigned LabelWidth;
+    BuildInputAnnotations(SM, CheckFileBufferID, ImpPatBufferIDRange, Diags,
+                          Annotations, LabelWidth);
+    DumpAnnotatedInput(errs(), Req, DumpInputFilter, DumpInputContext,
+                       InputFileText, Annotations, LabelWidth);
+  }
+
+  return ExitCode;
+}