diff --git a/centipede/BUILD b/centipede/BUILD index 37c0ded5..db261aea 100644 --- a/centipede/BUILD +++ b/centipede/BUILD @@ -363,13 +363,13 @@ cc_library( ":defs", ":feature", ":logging", + ":pc_info", ":remote_file", ":shard_reader", ":workdir", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", - "@com_google_absl//absl/strings", ], ) diff --git a/centipede/analyze_corpora.cc b/centipede/analyze_corpora.cc index 5f913242..6e3060ea 100644 --- a/centipede/analyze_corpora.cc +++ b/centipede/analyze_corpora.cc @@ -15,6 +15,7 @@ #include "./centipede/analyze_corpora.h" #include +#include #include #include #include @@ -23,7 +24,6 @@ #include "absl/container/flat_hash_set.h" #include "absl/log/check.h" #include "absl/log/log.h" -#include "absl/strings/str_cat.h" #include "./centipede/binary_info.h" #include "./centipede/control_flow.h" #include "./centipede/corpus.h" @@ -31,6 +31,7 @@ #include "./centipede/defs.h" #include "./centipede/feature.h" #include "./centipede/logging.h" +#include "./centipede/pc_info.h" #include "./centipede/remote_file.h" #include "./centipede/shard_reader.h" #include "./centipede/workdir.h" @@ -45,10 +46,9 @@ std::vector ReadCorpora(std::string_view binary_name, WorkDir workdir(std::string(workdir_path), std::string(binary_name), std::string(binary_hash), /*my_shard_index=*/0); std::vector corpus_paths; - RemoteGlobMatch(absl::StrCat(workdir.CorpusPathPrefix(), "*"), corpus_paths); + RemoteGlobMatch(workdir.CorpusFiles().AllShardsGlob(), corpus_paths); std::vector features_paths; - RemoteGlobMatch(absl::StrCat(workdir.FeaturesPathPrefix(), "*"), - features_paths); + RemoteGlobMatch(workdir.FeaturesFiles().AllShardsGlob(), features_paths); CHECK_EQ(corpus_paths.size(), features_paths.size()); std::vector corpus; diff --git a/centipede/centipede.cc b/centipede/centipede.cc index 41186992..123a9c96 100644 --- a/centipede/centipede.cc +++ b/centipede/centipede.cc @@ -130,10 
+130,10 @@ Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks, } void Centipede::CorpusToFiles(const Environment &env, std::string_view dir) { - WorkDir wd{env}; + const auto corpus_files = WorkDir{env}.CorpusFiles(); for (size_t shard = 0; shard < env.total_shards; shard++) { auto reader = DefaultBlobFileReaderFactory(); - auto corpus_path = wd.CorpusPath(shard); + auto corpus_path = corpus_files.ShardPath(shard); reader->Open(corpus_path).IgnoreError(); // may not exist. absl::Span blob; size_t num_read = 0; @@ -160,8 +160,9 @@ void Centipede::CorpusFromFiles(const Environment &env, std::string_view dir) { // Iterate over all shards. size_t inputs_added = 0; size_t inputs_ignored = 0; + const auto corpus_files = WorkDir{env}.CorpusFiles(); for (size_t shard = 0; shard < env.total_shards; shard++) { - const std::string corpus_path = wd.CorpusPath(shard); + const std::string corpus_path = corpus_files.ShardPath(shard); size_t num_shard_bytes = 0; // Read the shard (if it exists), collect input hashes from it. absl::flat_hash_set existing_hashes; @@ -397,8 +398,8 @@ void Centipede::LoadShard(const Environment &load_env, size_t shard_index, // See serialize_shard_loads on why we may want to serialize shard loads. // TODO(kcc): remove serialize_shard_loads when LoadShards() uses less RAM. 
const WorkDir wd{load_env}; - const std::string corpus_path = wd.CorpusPath(shard_index); - const std::string features_path = wd.FeaturesPath(shard_index); + const std::string corpus_path = wd.CorpusFiles().ShardPath(shard_index); + const std::string features_path = wd.FeaturesFiles().ShardPath(shard_index); if (env_.serialize_shard_loads) { ABSL_CONST_INIT static absl::Mutex load_shard_mu{absl::kConstInit}; absl::MutexLock lock(&load_shard_mu); @@ -432,7 +433,7 @@ void Centipede::LoadAllShardsInRandomOrder(const Environment &load_env, void Centipede::Rerun(std::vector &to_rerun) { if (to_rerun.empty()) return; - auto features_file_path = wd_.FeaturesPath(env_.my_shard_index); + auto features_file_path = wd_.FeaturesFiles().ShardPath(env_.my_shard_index); auto features_file = DefaultBlobFileWriterFactory(); CHECK_OK(features_file->Open(features_file_path, "a")); @@ -581,7 +582,8 @@ void Centipede::MergeFromOtherCorpus(std::string_view merge_from_dir, CHECK_GE(new_corpus_size, initial_corpus_size); // Corpus can't shrink here. if (new_corpus_size > initial_corpus_size) { auto appender = DefaultBlobFileWriterFactory(); - CHECK_OK(appender->Open(wd_.CorpusPath(env_.my_shard_index), "a")); + CHECK_OK( + appender->Open(wd_.CorpusFiles().ShardPath(env_.my_shard_index), "a")); for (size_t idx = initial_corpus_size; idx < new_corpus_size; ++idx) { CHECK_OK(appender->Write(corpus_.Get(idx))); } @@ -600,7 +602,7 @@ void Centipede::ReloadAllShardsAndWriteDistilledCorpus() { // Save the distilled corpus to a file in workdir and possibly to a hashed // file in the first corpus dir passed in `--corpus_dir`. 
- const auto distill_to_path = wd_.DistilledCorpusPath(); + const auto distill_to_path = wd_.DistilledCorpusFiles().MyShardPath(); LOG(INFO) << "Distilling: shard: " << env_.my_shard_index << " output: " << distill_to_path << " " << " distilled size: " << corpus_.NumActive(); @@ -660,10 +662,12 @@ void Centipede::FuzzingLoop() { MergeFromOtherCorpus(env_.merge_from, env_.my_shard_index); } + auto corpus_path = wd_.CorpusFiles().ShardPath(env_.my_shard_index); auto corpus_file = DefaultBlobFileWriterFactory(); + CHECK_OK(corpus_file->Open(corpus_path, "a")); + auto features_path = wd_.FeaturesFiles().ShardPath(env_.my_shard_index); auto features_file = DefaultBlobFileWriterFactory(); - CHECK_OK(corpus_file->Open(wd_.CorpusPath(env_.my_shard_index), "a")); - CHECK_OK(features_file->Open(wd_.FeaturesPath(env_.my_shard_index), "a")); + CHECK_OK(features_file->Open(features_path, "a")); // Load seed corpus when there is no external corpus loaded. if (corpus_.NumTotal() == 0) LoadSeedInputs(); diff --git a/centipede/distill.cc b/centipede/distill.cc index 23a6bef0..8d6c198b 100644 --- a/centipede/distill.cc +++ b/centipede/distill.cc @@ -42,8 +42,8 @@ void DistillTask(const Environment &env, const std::vector &shard_indices) { const WorkDir wd{env}; std::string log_line = absl::StrCat("DISTILL[S.", env.my_shard_index, "]: "); - const auto corpus_path = wd.DistilledCorpusPath(); - const auto features_path = wd.DistilledFeaturesPath(); + const auto corpus_path = wd.DistilledCorpusFiles().MyShardPath(); + const auto features_path = wd.DistilledFeaturesFiles().MyShardPath(); LOG(INFO) << log_line << VV(env.total_shards) << VV(corpus_path) << VV(features_path); @@ -59,9 +59,11 @@ void DistillTask(const Environment &env, const size_t num_total_shards = shard_indices.size(); size_t num_shards_read = 0; size_t num_distilled_corpus_elements = 0; + const auto corpus_files = wd.CorpusFiles(); + const auto features_files = wd.FeaturesFiles(); for (size_t shard_idx : shard_indices) { 
- const std::string corpus_path = wd.CorpusPath(shard_idx); - const std::string features_path = wd.FeaturesPath(shard_idx); + const std::string corpus_path = corpus_files.ShardPath(shard_idx); + const std::string features_path = features_files.ShardPath(shard_idx); VLOG(2) << log_line << "reading shard " << shard_idx << " from:\n" << VV(corpus_path) << "\n" << VV(features_path); diff --git a/centipede/distill_test.cc b/centipede/distill_test.cc index c3e10614..cfe668cc 100644 --- a/centipede/distill_test.cc +++ b/centipede/distill_test.cc @@ -24,7 +24,6 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/flags/declare.h" -#include "absl/flags/flag.h" #include "absl/flags/reflection.h" #include "absl/log/check.h" #include "./centipede/blob_file.h" @@ -69,8 +68,8 @@ using InputVec = std::vector; void WriteToShard(const Environment &env, const TestCorpusRecord &record, size_t shard_index) { const WorkDir wd{env}; - auto corpus_path = wd.CorpusPath(shard_index); - auto features_path = wd.FeaturesPath(shard_index); + const auto corpus_path = wd.CorpusFiles().ShardPath(shard_index); + const auto features_path = wd.FeaturesFiles().ShardPath(shard_index); const auto corpus_appender = DefaultBlobFileWriterFactory(); const auto features_appender = DefaultBlobFileWriterFactory(); CHECK_OK(corpus_appender->Open(corpus_path, "a")); @@ -83,15 +82,16 @@ void WriteToShard(const Environment &env, const TestCorpusRecord &record, // Reads and returns the distilled corpus record from // `wd.DistilledCorpusPath()` and `wd.DistilledFeaturesPath()`. 
std::vector ReadFromDistilled(const WorkDir &wd) { - auto distilled_corpus_path = wd.DistilledCorpusPath(); - auto distilled_features_path = wd.DistilledFeaturesPath(); + const auto distilled_corpus_path = wd.DistilledCorpusFiles().MyShardPath(); + const auto distilled_features_path = + wd.DistilledFeaturesFiles().MyShardPath(); std::vector result; auto shard_reader_callback = [&result](const ByteArray &input, FeatureVec &features) { result.push_back({input, features}); }; - ReadShard(wd.DistilledCorpusPath(), wd.DistilledFeaturesPath(), + ReadShard(distilled_corpus_path, distilled_features_path, shard_reader_callback); return result; } diff --git a/centipede/workdir.cc b/centipede/workdir.cc index dc996776..2a582d21 100644 --- a/centipede/workdir.cc +++ b/centipede/workdir.cc @@ -45,6 +45,30 @@ std::string NormalizeAnnotation(std::string_view annotation) { } // namespace +//------------------------------------------------------------------------------ +// WorkDir::ShardedFileInfo + +WorkDir::ShardedFileInfo::ShardedFileInfo(std::string_view base_dir, + std::string_view rel_prefix, + size_t my_shard_index) + : prefix_{std::filesystem::path(base_dir) / rel_prefix}, + my_shard_index_{my_shard_index} {} + +std::string WorkDir::ShardedFileInfo::ShardPath(size_t shard_index) const { + return absl::StrFormat("%s%0*d", prefix_, kDigitsInShardIndex, shard_index); +} + +std::string WorkDir::ShardedFileInfo::MyShardPath() const { + return ShardPath(my_shard_index_); +} + +std::string WorkDir::ShardedFileInfo::AllShardsGlob() const { + return absl::StrCat(prefix_, "*"); +} + +//------------------------------------------------------------------------------ +// WorkDir + WorkDir::WorkDir( // std::string workdir, // std::string binary_name, // @@ -78,36 +102,23 @@ std::string WorkDir::BinaryInfoDirPath() const { return std::filesystem::path(CoverageDirPath()) / "binary-info"; } -std::string WorkDir::CorpusPathPrefix() const { - return std::filesystem::path(workdir_) / "corpus."; -} - 
-std::string WorkDir::CorpusPath(size_t shard_index) const { - return absl::StrCat( - CorpusPathPrefix(), - absl::StrFormat("%0*d", kDigitsInShardIndex, shard_index)); +WorkDir::ShardedFileInfo WorkDir::CorpusFiles() const { + return {workdir_, "corpus.", my_shard_index_}; } -std::string WorkDir::FeaturesPathPrefix() const { - return std::filesystem::path(CoverageDirPath()) / "features."; +WorkDir::ShardedFileInfo WorkDir::DistilledCorpusFiles() const { + return {workdir_, absl::StrCat("distilled-", binary_name_, "."), + my_shard_index_}; } -std::string WorkDir::FeaturesPath(size_t shard_index) const { - return absl::StrCat( - FeaturesPathPrefix(), - absl::StrFormat("%0*d", kDigitsInShardIndex, shard_index)); -} - -std::string WorkDir::DistilledCorpusPath() const { - return std::filesystem::path(workdir_) / - absl::StrFormat("distilled-%s.%0*d", binary_name_, kDigitsInShardIndex, - my_shard_index_); +WorkDir::ShardedFileInfo WorkDir::FeaturesFiles() const { + return {CoverageDirPath(), "features.", my_shard_index_}; } -std::string WorkDir::DistilledFeaturesPath() const { - return std::filesystem::path(CoverageDirPath()) - .append(absl::StrFormat("distilled-features-%s.%0*d", binary_name_, - kDigitsInShardIndex, my_shard_index_)); +WorkDir::ShardedFileInfo WorkDir::DistilledFeaturesFiles() const { + return {CoverageDirPath(), + absl::StrCat("distilled-features-", binary_name_, "."), + my_shard_index_}; } std::string WorkDir::CoverageReportPath(std::string_view annotation) const { diff --git a/centipede/workdir.h b/centipede/workdir.h index abdb36d5..049f5ec6 100644 --- a/centipede/workdir.h +++ b/centipede/workdir.h @@ -31,6 +31,26 @@ class WorkDir { // pad indices with 0's in output file names so the names are sorted by index. static constexpr int kDigitsInShardIndex = 6; + // Provides APIs for getting paths of a particular category of sharded files. + class ShardedFileInfo { + public: + // Returns the path of the shard file for `shard_index`. 
+ std::string ShardPath(size_t shard_index) const; + // Returns the path of the shard file for `my_shard_index_`. + std::string MyShardPath() const; + // Returns a glob matching all the shard files. + std::string AllShardsGlob() const; + + private: + friend class WorkDir; + + ShardedFileInfo(std::string_view base_dir, std::string_view rel_prefix, + size_t my_shard_index); + + const std::string prefix_; + const size_t my_shard_index_; + }; + // Constructs an object from directly provided field values. WorkDir( // std::string workdir, // @@ -57,21 +77,14 @@ class WorkDir { // Returns the path where the BinaryInfo will be serialized within workdir. std::string BinaryInfoDirPath() const; - // Returns the path for a corpus file by its shard_index. - std::string CorpusPath(size_t shard_index) const; - std::string CorpusPath() const { return CorpusPath(my_shard_index_); } - // Returns the prefix of all corpus shards - std::string CorpusPathPrefix() const; - // Returns the path for the distilled corpus file for my_shard_index. - std::string DistilledCorpusPath() const; - - // Returns the path for a features file by its shard_index. - std::string FeaturesPath(size_t shard_index) const; - std::string FeaturesPath() const { return FeaturesPath(my_shard_index_); } - // Returns the prefix of all feature shards - std::string FeaturesPathPrefix() const; - // Returns the path for the distilled features file for my_shard_index. - std::string DistilledFeaturesPath() const; + // Returns the path info for the corpus files. + ShardedFileInfo CorpusFiles() const; + // Returns the path info for the distilled corpus files. + ShardedFileInfo DistilledCorpusFiles() const; + // Returns the path info for the features files. + ShardedFileInfo FeaturesFiles() const; + // Returns the path info for the distilled features files. + ShardedFileInfo DistilledFeaturesFiles() const; // Returns the path for the coverage report file for my_shard_index. 
// Non-default `annotation` becomes a part of the returned filename. diff --git a/centipede/workdir_test.cc b/centipede/workdir_test.cc index 6486d83f..f9ae7917 100644 --- a/centipede/workdir_test.cc +++ b/centipede/workdir_test.cc @@ -43,14 +43,30 @@ TEST(WorkDirTest, Main) { EXPECT_EQ(wd.CrashReproducerDirPath(), "/dir/crashes"); EXPECT_EQ(wd.BinaryInfoDirPath(), "/dir/bin-hash/binary-info"); - EXPECT_EQ(wd.CorpusPath(), "/dir/corpus.000003"); - EXPECT_EQ(wd.CorpusPath(7), "/dir/corpus.000007"); - EXPECT_EQ(wd.DistilledCorpusPath(), "/dir/distilled-bin.000003"); + EXPECT_EQ(wd.CorpusFiles().MyShardPath(), "/dir/corpus.000003"); + EXPECT_EQ(wd.CorpusFiles().ShardPath(7), "/dir/corpus.000007"); + EXPECT_EQ(wd.CorpusFiles().AllShardsGlob(), "/dir/corpus.*"); - EXPECT_EQ(wd.FeaturesPath(), "/dir/bin-hash/features.000003"); - EXPECT_EQ(wd.FeaturesPath(7), "/dir/bin-hash/features.000007"); - EXPECT_EQ(wd.DistilledFeaturesPath(), + EXPECT_EQ(wd.DistilledCorpusFiles().MyShardPath(), // + "/dir/distilled-bin.000003"); + EXPECT_EQ(wd.DistilledCorpusFiles().ShardPath(7), // + "/dir/distilled-bin.000007"); + EXPECT_EQ(wd.DistilledCorpusFiles().AllShardsGlob(), // + "/dir/distilled-bin.*"); + + EXPECT_EQ(wd.FeaturesFiles().MyShardPath(), // + "/dir/bin-hash/features.000003"); + EXPECT_EQ(wd.FeaturesFiles().ShardPath(7), // + "/dir/bin-hash/features.000007"); + EXPECT_EQ(wd.FeaturesFiles().AllShardsGlob(), // + "/dir/bin-hash/features.*"); + + EXPECT_EQ(wd.DistilledFeaturesFiles().MyShardPath(), // "/dir/bin-hash/distilled-features-bin.000003"); + EXPECT_EQ(wd.DistilledFeaturesFiles().ShardPath(7), // + "/dir/bin-hash/distilled-features-bin.000007"); + EXPECT_EQ(wd.DistilledFeaturesFiles().AllShardsGlob(), // + "/dir/bin-hash/distilled-features-bin.*"); EXPECT_EQ(wd.CoverageReportPath(), // "/dir/coverage-report-bin.000003.txt");