From 5aa871caef0c9d08a5c3a71ed893aae042a0fb33 Mon Sep 17 00:00:00 2001 From: Sergey Shevchenko Date: Fri, 3 Nov 2023 13:56:24 -0700 Subject: [PATCH] #Centipede Replace hardcoded num of digits in shard filenames with public constant from `WorkDir` PiperOrigin-RevId: 579297862 --- centipede/BUILD | 7 +- centipede/seed_corpus_maker.cc | 14 ++- centipede/seed_corpus_maker_flags.cc | 10 ++ centipede/seed_corpus_maker_flags.h | 2 + centipede/seed_corpus_maker_lib.cc | 181 +++++++++++++++++++-------- centipede/seed_corpus_maker_lib.h | 43 +++++-- 6 files changed, 193 insertions(+), 64 deletions(-) diff --git a/centipede/BUILD b/centipede/BUILD index 7d34b887f..deea48ea6 100644 --- a/centipede/BUILD +++ b/centipede/BUILD @@ -59,6 +59,7 @@ cc_binary( ":config_init", ":seed_corpus_maker_flags", ":seed_corpus_maker_lib", + ":util", "@com_google_absl//absl/flags:flag", ], ) @@ -998,16 +999,18 @@ cc_library( deps = [ ":blob_file", ":defs", + ":feature", ":logging", ":remote_file", ":seed_corpus_config_cc_proto", + ":shard_reader", + ":util", + ":workdir", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/random", - "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", "@com_google_protobuf//:protobuf", ], ) diff --git a/centipede/seed_corpus_maker.cc b/centipede/seed_corpus_maker.cc index 5798f0575..b5d9d1ee6 100644 --- a/centipede/seed_corpus_maker.cc +++ b/centipede/seed_corpus_maker.cc @@ -13,17 +13,29 @@ // limitations under the License. 
#include +#include // NOLINT +#include #include "absl/flags/flag.h" #include "./centipede/config_init.h" #include "./centipede/seed_corpus_maker_flags.h" #include "./centipede/seed_corpus_maker_lib.h" +#include "./centipede/util.h" int main(int argc, char** argv) { (void)centipede::config::InitRuntime(argc, argv); + const std::string config = absl::GetFlag(FLAGS_config); + const std::string binary_path = absl::GetFlag(FLAGS_coverage_binary_path); + std::string binary_hash = absl::GetFlag(FLAGS_coverage_binary_hash); + if (binary_hash.empty() && !binary_path.empty()) { + binary_hash = centipede::HashOfFileContents(binary_path); + } + const std::string binary_name = std::filesystem::path{binary_path}.filename(); + const std::string override_out_dir = absl::GetFlag(FLAGS_override_out_dir); + centipede::GenerateSeedCorpusFromConfig( // - absl::GetFlag(FLAGS_config), absl::GetFlag(FLAGS_override_out_dir)); + config, binary_name, binary_hash, override_out_dir); return EXIT_SUCCESS; } diff --git a/centipede/seed_corpus_maker_flags.cc b/centipede/seed_corpus_maker_flags.cc index 879922569..cc8360430 100644 --- a/centipede/seed_corpus_maker_flags.cc +++ b/centipede/seed_corpus_maker_flags.cc @@ -26,6 +26,16 @@ ABSL_FLAG( "parent dir, if --config is a filename, or the current dir otherwise.\n" "Furthermore, `destination.dir_path` can be overridden by passing a " "non-empty --out_dir."); +ABSL_FLAG( + std::string, coverage_binary_path, "", + "The path of the binary from which coverage is to be collected. Can be " + "just the basename of the binary, but in that case --coverage_binary_hash " + "must also be provided."); +ABSL_FLAG( + std::string, coverage_binary_hash, "", + "If not-empty, this hash is used instead of the actual hash of the " + "contents of --coverage_binary_path. 
Use when the binary pointed at by " + "--coverage_binary_path is not actually available on disk."); ABSL_FLAG( std::string, override_out_dir, "", "If non-empty, overrides the `destination.dir_path` field in the resolved " diff --git a/centipede/seed_corpus_maker_flags.h b/centipede/seed_corpus_maker_flags.h index 3f76bc95c..a57f13969 100644 --- a/centipede/seed_corpus_maker_flags.h +++ b/centipede/seed_corpus_maker_flags.h @@ -20,6 +20,8 @@ #include "absl/flags/declare.h" ABSL_DECLARE_FLAG(std::string, config); +ABSL_DECLARE_FLAG(std::string, coverage_binary_path); +ABSL_DECLARE_FLAG(std::string, coverage_binary_hash); ABSL_DECLARE_FLAG(std::string, override_out_dir); #endif // THIRD_PARTY_CENTIPEDE_SEED_CORPUS_MAKER_FLAGS_H_ diff --git a/centipede/seed_corpus_maker_lib.cc b/centipede/seed_corpus_maker_lib.cc index fcbf50a25..5cb1ebe50 100644 --- a/centipede/seed_corpus_maker_lib.cc +++ b/centipede/seed_corpus_maker_lib.cc @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include // NOLINT @@ -29,21 +28,24 @@ #include #include #include +#include #include #include "absl/log/check.h" #include "absl/log/log.h" #include "absl/random/random.h" -#include "absl/status/status.h" #include "absl/strings/match.h" #include "absl/strings/str_format.h" #include "absl/strings/str_replace.h" -#include "absl/types/span.h" #include "./centipede/blob_file.h" #include "./centipede/defs.h" +#include "./centipede/feature.h" #include "./centipede/logging.h" #include "./centipede/remote_file.h" #include "./centipede/seed_corpus_config.pb.h" +#include "./centipede/shard_reader.h" +#include "./centipede/util.h" +#include "./centipede/workdir.h" #include "google/protobuf/text_format.h" // TODO(ussuri): Add unit tests. 
@@ -59,6 +61,9 @@ namespace centipede { namespace fs = std::filesystem; +using InputAndFeatures = std::pair; +using InputAndFeaturesVec = std::vector; + SeedCorpusConfig ResolveSeedCorpusConfig( // std::string_view config_spec, // std::string_view override_out_dir) { @@ -110,14 +115,21 @@ SeedCorpusConfig ResolveSeedCorpusConfig( // } } + if (config.destination().shard_index_digits() == 0) { + config.mutable_destination()->set_shard_index_digits( + WorkDir::kDigitsInShardIndex); + } + LOG(INFO) << "Resolved config:\n" << config.DebugString(); return config; } -void SampleSeedCorpusElementsFromSource( // - const SeedCorpusSource& source, // - std::vector& elements) { +void SampleSeedCorpusElementsFromSource( // + const SeedCorpusSource& source, // + std::string_view coverage_binary_name, // + std::string_view coverage_binary_hash, // + InputAndFeaturesVec& elements) { LOG(INFO) << "Reading/sampling seed corpus elements from source:\n" << source.DebugString(); @@ -157,35 +169,39 @@ void SampleSeedCorpusElementsFromSource( // // Read all the elements from the found corpus shard files. - std::vector src_elts; + InputAndFeaturesVec src_elts; + size_t num_non_empty_features = 0; for (const auto& corpus_fname : corpus_fnames) { - std::unique_ptr corpus_reader = - centipede::DefaultBlobFileReaderFactory(); - CHECK(corpus_reader != nullptr); - CHECK_OK(corpus_reader->Open(corpus_fname)) << VV(corpus_fname); - - absl::Status read_status; - size_t num_read_elts = 0; - while (true) { - absl::Span elt; - read_status = corpus_reader->Read(elt); - // Reached EOF - done with this shard. 
- if (absl::IsOutOfRange(read_status)) break; - CHECK_OK(read_status) - << "Failure reading elements from shard " << corpus_fname; - CHECK(!elt.empty()) << "Read empty element: " << VV(corpus_fname); - src_elts.emplace_back(elt.begin(), elt.end()); - ++num_read_elts; - } - - corpus_reader->Close().IgnoreError(); - - LOG(INFO) << "Read " << num_read_elts << " elements from shard " - << corpus_fname; + // NOTE: The deduced matching `features_fname` may not exist if the source + // corpus was generated for a coverage binary that is different from the one + // we need, but `ReadShard()` can tolerate that, passing empty `FeatureVec`s + // to the callback if that's the case. + const auto work_dir = WorkDir::FromCorpusShardPath( // + corpus_fname, coverage_binary_name, coverage_binary_hash); + const std::string features_fname = + work_dir.CorpusFiles().IsShardPath(corpus_fname) + ? work_dir.FeaturesFiles().MyShardPath() + : work_dir.DistilledCorpusFiles().IsShardPath(corpus_fname) + ? work_dir.DistilledFeaturesFiles().MyShardPath() + : ""; + size_t prev_src_elts_size = src_elts.size(); + size_t prev_num_non_empty_features = num_non_empty_features; + ReadShard(corpus_fname, features_fname, + [&src_elts, &num_non_empty_features](const ByteArray& input, + FeatureVec& features) { + num_non_empty_features += features.empty() ? 
0 : 1; + src_elts.emplace_back(input, std::move(features)); + }); + LOG(INFO) << "Read " << (src_elts.size() - prev_src_elts_size) + << " elements with " + << (num_non_empty_features - prev_num_non_empty_features) + << " non-empty features from source shard:\n" + << VV(corpus_fname) << "\n" + << VV(features_fname); } - - LOG(INFO) << "Read " << src_elts.size() << " elements total from source " + LOG(INFO) << "Read total of " << src_elts.size() << " elements with " + << num_non_empty_features << " non-empty features from source " << source.dir_glob(); // Extract a sample of the elements of the size specified in @@ -219,8 +235,10 @@ void SampleSeedCorpusElementsFromSource( // } } -void WriteSeedCorpusElementsToDestination( // - const std::vector& elements, // +void WriteSeedCorpusElementsToDestination( // + const InputAndFeaturesVec& elements, // + std::string_view coverage_binary_name, // + std::string_view coverage_binary_hash, // const SeedCorpusDestination& destination) { LOG(INFO) << "Writing seed corpus elements to destination:\n" << destination.DebugString(); @@ -243,41 +261,92 @@ void WriteSeedCorpusElementsToDestination( // } // Write the elements to the shard files. - // TODO(b/295978603): Replace the 6 with `WorkdirMgr::kDigitsInShardIndex`. - const auto shard_index_digits = destination.shard_index_digits() > 0 - ? destination.shard_index_digits() - : 6; auto elt_it = elements.cbegin(); for (size_t s = 0; s < shard_sizes.size(); ++s) { // Generate the output shard's filename. + // TODO(ussuri): Use more of `WorkDir` APIs here (possibly extend them, + // and possibly retire `SeedCorpusDestination::shard_index_digits`). 
const std::string shard_idx = - absl::StrFormat("%0*d", shard_index_digits, s); - const std::string shard_rel_fname = + absl::StrFormat("%0*d", destination.shard_index_digits(), s); + const std::string corpus_rel_fname = absl::StrReplaceAll(destination.shard_rel_glob(), {{"*", shard_idx}}); - const std::string shard_fname = - fs::path{destination.dir_path()} / shard_rel_fname; + const std::string corpus_fname = + fs::path{destination.dir_path()} / corpus_rel_fname; + + const auto work_dir = WorkDir::FromCorpusShardPath( // + corpus_fname, coverage_binary_name, coverage_binary_hash); + + CHECK(corpus_fname == work_dir.CorpusFiles().MyShardPath() || + corpus_fname == work_dir.DistilledCorpusFiles().MyShardPath()) + << "Bad config: generated destination corpus filename '" << corpus_fname + << "' doesn't match one of two expected forms '" + << work_dir.CorpusFiles().MyShardPath() << "' or '" + << work_dir.DistilledCorpusFiles().MyShardPath() + << "'; make sure binary name in config matches explicitly passed '" + << coverage_binary_name << "'"; + + const std::string features_fname = + work_dir.CorpusFiles().IsShardPath(corpus_fname) + ? work_dir.FeaturesFiles().MyShardPath() + : work_dir.DistilledFeaturesFiles().MyShardPath(); + CHECK(!features_fname.empty()); + + LOG(INFO) << "Writing " << shard_sizes[s] + << " elements to destination shard:\n" + << VV(corpus_fname) << "\n" + << VV(features_fname); + + // Features files are always saved in a subdir of the workdir + // (== `destination.dir_path()` here), which might not exist yet, so we + // create it. Corpus files are saved in the workdir directly, but we also + // create it in case `destination.shard_rel_glob()` contains some dirs + // (not really intended for that, but the end-user may do that). 
+ if (!corpus_fname.empty()) { + RemoteMkdir(fs::path{corpus_fname}.parent_path().string()); + } + if (!features_fname.empty()) { + RemoteMkdir(fs::path{features_fname}.parent_path().string()); + } + + // Create writers for the corpus and features shard files. - LOG(INFO) << "Writing " << shard_sizes[s] << " elements to " << shard_fname; + // TODO(ussuri): 1. Once the whole thing is a class, make + // `num_non_empty_features` a member and don't even create a features file + // if 0. 2. Wrap corpus/features writing in a similar API to `ReadShard()`. - // Open the shard's file. - std::unique_ptr corpus_writer = + const std::unique_ptr corpus_writer = centipede::DefaultBlobFileWriterFactory(); CHECK(corpus_writer != nullptr); - CHECK_OK(corpus_writer->Open(shard_fname, "w")) << VV(shard_fname); + CHECK_OK(corpus_writer->Open(corpus_fname, "w")) << VV(corpus_fname); + + const std::unique_ptr features_writer = + DefaultBlobFileWriterFactory(); + CHECK(features_writer != nullptr); + CHECK_OK(features_writer->Open(features_fname, "w")) << VV(features_fname); + + // Write the shard's elements to the corpus and features shard files. - // Write the shard's elements to the file. 
for (size_t e = 0, ee = shard_sizes[s]; e < ee; ++e) { CHECK(elt_it != elements.cend()); - CHECK_OK(corpus_writer->Write(*elt_it)) << VV(shard_fname); + const ByteArray& input = elt_it->first; + CHECK_OK(corpus_writer->Write(input)) << VV(corpus_fname); + const FeatureVec& features = elt_it->second; + if (!features.empty()) { + const ByteArray packed_features = PackFeaturesAndHash(input, features); + CHECK_OK(features_writer->Write(packed_features)) << VV(features_fname); + } ++elt_it; } - CHECK_OK(corpus_writer->Close()) << VV(shard_fname); + CHECK_OK(corpus_writer->Close()) << VV(corpus_fname); + CHECK_OK(features_writer->Close()) << VV(features_fname); } } -void GenerateSeedCorpusFromConfig( // - std::string_view config_spec, // +void GenerateSeedCorpusFromConfig( // + std::string_view config_spec, // + std::string_view coverage_binary_name, // + std::string_view coverage_binary_hash, // std::string_view override_out_dir) { const SeedCorpusConfig config = ResolveSeedCorpusConfig(config_spec, override_out_dir); @@ -290,14 +359,18 @@ void GenerateSeedCorpusFromConfig( // // Pre-create the destination dir early to catch possible misspellings etc. 
RemoteMkdir(config.destination().dir_path()); - std::vector elements; + InputAndFeaturesVec elements; + for (const auto& source : config.sources()) { - SampleSeedCorpusElementsFromSource(source, elements); + SampleSeedCorpusElementsFromSource( // + source, coverage_binary_name, coverage_binary_hash, elements); } LOG(INFO) << "Sampled " << elements.size() << " elements from " << config.sources_size() << " seed corpus source(s)"; - WriteSeedCorpusElementsToDestination(elements, config.destination()); + WriteSeedCorpusElementsToDestination( // + elements, coverage_binary_name, coverage_binary_hash, + config.destination()); LOG(INFO) << "Wrote " << elements.size() << " elements to seed corpus destination"; } diff --git a/centipede/seed_corpus_maker_lib.h b/centipede/seed_corpus_maker_lib.h index 9123f2325..27dca060b 100644 --- a/centipede/seed_corpus_maker_lib.h +++ b/centipede/seed_corpus_maker_lib.h @@ -16,9 +16,11 @@ #define THIRD_PARTY_CENTIPEDE_SEED_CORPUS_MAKER_LIB_H_ #include +#include #include #include "./centipede/defs.h" +#include "./centipede/feature.h" #include "./centipede/seed_corpus_config.pb.h" namespace centipede { @@ -37,15 +39,33 @@ SeedCorpusConfig ResolveSeedCorpusConfig( // // Extracts a sample of corpus elements from `source` and appends the results to // `elements`. `source` defines the locations of the corpus shards and the size // of the sample. -void SampleSeedCorpusElementsFromSource( // - const SeedCorpusSource& source, // - std::vector& elements); +// +// `coverage_binary_name` should be the basename of the coverage binary for +// which the seed corpus is to be created, and the `coverage_binary_hash` should +// be the hash of that binary. If a corpus shard file found in the source +// directory contains a matching features shard file in the +// <coverage_binary_name>-<coverage_binary_hash> subdir, the matching features +// will be copied over to `elements`; otherwise, an empty `FeatureVec` will be +// used instead. 
+void SampleSeedCorpusElementsFromSource( // + const SeedCorpusSource& source, // + std::string_view coverage_binary_name, // + std::string_view coverage_binary_hash, // + std::vector>& elements); // Writes seed corpus `elements` to `destination`. Any previously existing // corpus shard files matching `destination.shard_glob()` will be deleted // before writing (even if writing subsequently fails). -void WriteSeedCorpusElementsToDestination( // - const std::vector& elements, // +// +// `coverage_binary_name` should be the basename of the coverage binary for +// which the seed corpus is to be created, and the `coverage_binary_hash` should +// be the hash of that binary. The features in each `FeatureVec` of the +// `elements` will be saved to a features shard file under +// <coverage_binary_name>-<coverage_binary_hash> subdir of the destination. +void WriteSeedCorpusElementsToDestination( // + const std::vector>& elements, // + std::string_view coverage_binary_name, // + std::string_view coverage_binary_hash, // const SeedCorpusDestination& destination); // Reads and samples seed corpus elements from all the sources and writes the @@ -58,8 +78,17 @@ void WriteSeedCorpusElementsToDestination( // // (if `config_spec` is a verbatim string) as the base dir. // If `override_out_dir` is non-empty, it overrides `destination.dir_path` // specified in `config_spec`. -void GenerateSeedCorpusFromConfig( // - std::string_view config_spec, // +// +// `coverage_binary_name` should be the basename of the coverage binary for +// which the seed corpus is to be created, and the `coverage_binary_hash` should +// be the hash of that binary. The features matching each sampled source corpus +// element will be copied over from the +// <coverage_binary_name>-<coverage_binary_hash> subdir of the source to the +// same subdir of the destination. +void GenerateSeedCorpusFromConfig( // + std::string_view config_spec, // + std::string_view coverage_binary_name, // + std::string_view coverage_binary_hash, // std::string_view override_out_dir = ""); } // namespace centipede