Skip to content

Commit

Permalink
#Centipede Rework & expand some path APIs in WorkDir
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 577023560
  • Loading branch information
ussuri authored and copybara-github committed Oct 27, 2023
1 parent 93db234 commit e700aba
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 71 deletions.
2 changes: 1 addition & 1 deletion centipede/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -351,13 +351,13 @@ cc_library(
":defs",
":feature",
":logging",
":pc_info",
":remote_file",
":shard_reader",
":workdir",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings",
],
)

Expand Down
8 changes: 4 additions & 4 deletions centipede/analyze_corpora.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "./centipede/analyze_corpora.h"

#include <algorithm>
#include <cstddef>
#include <string>
#include <string_view>
#include <utility>
Expand All @@ -23,14 +24,14 @@
#include "absl/container/flat_hash_set.h"
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/strings/str_cat.h"
#include "./centipede/binary_info.h"
#include "./centipede/control_flow.h"
#include "./centipede/corpus.h"
#include "./centipede/coverage.h"
#include "./centipede/defs.h"
#include "./centipede/feature.h"
#include "./centipede/logging.h"
#include "./centipede/pc_info.h"
#include "./centipede/remote_file.h"
#include "./centipede/shard_reader.h"
#include "./centipede/workdir.h"
Expand All @@ -45,10 +46,9 @@ std::vector<CorpusRecord> ReadCorpora(std::string_view binary_name,
WorkDir workdir(std::string(workdir_path), std::string(binary_name),
std::string(binary_hash), /*my_shard_index=*/0);
std::vector<std::string> corpus_paths;
RemoteGlobMatch(absl::StrCat(workdir.CorpusPathPrefix(), "*"), corpus_paths);
RemoteGlobMatch(workdir.CorpusPath().Glob(), corpus_paths);
std::vector<std::string> features_paths;
RemoteGlobMatch(absl::StrCat(workdir.FeaturesPathPrefix(), "*"),
features_paths);
RemoteGlobMatch(workdir.FeaturesPath().Glob(), features_paths);

CHECK_EQ(corpus_paths.size(), features_paths.size());
std::vector<CorpusRecord> corpus;
Expand Down
23 changes: 12 additions & 11 deletions centipede/centipede.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,10 @@ Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks,

void Centipede::SaveCorpusToLocalDir(
const Environment &env, std::string_view save_corpus_to_local_dir) {
const WorkDir wd{env};
const auto corpus_path_info = WorkDir{env}.CorpusPath();
for (size_t shard = 0; shard < env.total_shards; shard++) {
auto reader = DefaultBlobFileReaderFactory();
auto corpus_path = wd.CorpusPath(shard);
auto corpus_path = corpus_path_info.Shard(shard);
reader->Open(corpus_path).IgnoreError(); // may not exist.
absl::Span<uint8_t> blob;
size_t num_read = 0;
Expand All @@ -148,7 +148,6 @@ void Centipede::SaveCorpusToLocalDir(

void Centipede::ExportCorpusFromLocalDir(const Environment &env,
std::string_view local_dir) {
const WorkDir wd{env};
// Shard the file paths in `local_dir` based on hashes of filenames.
// Such partition is stable: a given file always goes to a specific shard.
std::vector<std::vector<std::string>> sharded_paths(env.total_shards);
Expand All @@ -164,8 +163,9 @@ void Centipede::ExportCorpusFromLocalDir(const Environment &env,
// Iterate over all shards.
size_t inputs_added = 0;
size_t inputs_ignored = 0;
const auto corpus_path_info = WorkDir{env}.CorpusPath();
for (size_t shard = 0; shard < env.total_shards; shard++) {
const std::string corpus_path = wd.CorpusPath(shard);
const std::string corpus_path = corpus_path_info.Shard(shard);
size_t num_shard_bytes = 0;
// Read the shard (if it exists), collect input hashes from it.
absl::flat_hash_set<std::string> existing_hashes;
Expand Down Expand Up @@ -401,8 +401,8 @@ void Centipede::LoadShard(const Environment &load_env, size_t shard_index,
// See serialize_shard_loads on why we may want to serialize shard loads.
// TODO(kcc): remove serialize_shard_loads when LoadShards() uses less RAM.
const WorkDir wd{load_env};
const std::string corpus_path = wd.CorpusPath(shard_index);
const std::string features_path = wd.FeaturesPath(shard_index);
const std::string corpus_path = wd.CorpusPath().Shard(shard_index);
const std::string features_path = wd.FeaturesPath().Shard(shard_index);
if (env_.serialize_shard_loads) {
ABSL_CONST_INIT static absl::Mutex load_shard_mu{absl::kConstInit};
absl::MutexLock lock(&load_shard_mu);
Expand Down Expand Up @@ -436,7 +436,7 @@ void Centipede::LoadAllShardsInRandomOrder(const Environment &load_env,

void Centipede::Rerun(std::vector<ByteArray> &to_rerun) {
if (to_rerun.empty()) return;
auto features_file_path = wd_.FeaturesPath(env_.my_shard_index);
auto features_file_path = wd_.FeaturesPath().Shard(env_.my_shard_index);
auto features_file = DefaultBlobFileWriterFactory();
CHECK_OK(features_file->Open(features_file_path, "a"));

Expand Down Expand Up @@ -585,7 +585,7 @@ void Centipede::MergeFromOtherCorpus(std::string_view merge_from_dir,
CHECK_GE(new_corpus_size, initial_corpus_size); // Corpus can't shrink here.
if (new_corpus_size > initial_corpus_size) {
auto appender = DefaultBlobFileWriterFactory();
CHECK_OK(appender->Open(wd_.CorpusPath(env_.my_shard_index), "a"));
CHECK_OK(appender->Open(wd_.CorpusPath().Shard(env_.my_shard_index), "a"));
for (size_t idx = initial_corpus_size; idx < new_corpus_size; ++idx) {
CHECK_OK(appender->Write(corpus_.Get(idx)));
}
Expand All @@ -604,7 +604,7 @@ void Centipede::ReloadAllShardsAndWriteDistilledCorpus() {

// Save the distilled corpus to a file in workdir and possibly to a hashed
// file in the first corpus dir passed in `--corpus_dir`.
const auto distill_to_path = wd_.DistilledCorpusPath();
const auto distill_to_path = wd_.DistilledCorpusPath().MyShard();
LOG(INFO) << "Distilling: shard: " << env_.my_shard_index
<< " output: " << distill_to_path << " "
<< " distilled size: " << corpus_.NumActive();
Expand Down Expand Up @@ -666,8 +666,9 @@ void Centipede::FuzzingLoop() {

auto corpus_file = DefaultBlobFileWriterFactory();
auto features_file = DefaultBlobFileWriterFactory();
CHECK_OK(corpus_file->Open(wd_.CorpusPath(env_.my_shard_index), "a"));
CHECK_OK(features_file->Open(wd_.FeaturesPath(env_.my_shard_index), "a"));
CHECK_OK(corpus_file->Open(wd_.CorpusPath().Shard(env_.my_shard_index), "a"));
CHECK_OK(
features_file->Open(wd_.FeaturesPath().Shard(env_.my_shard_index), "a"));

// Load seed corpus when there is no external corpus loaded.
if (corpus_.NumTotal() == 0) LoadSeedInputs();
Expand Down
10 changes: 6 additions & 4 deletions centipede/distill.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ void DistillTask(const Environment &env,
const std::vector<size_t> &shard_indices) {
const WorkDir wd{env};
std::string log_line = absl::StrCat("DISTILL[S.", env.my_shard_index, "]: ");
const auto corpus_path = wd.DistilledCorpusPath();
const auto features_path = wd.DistilledFeaturesPath();
const auto corpus_path = wd.DistilledCorpusPath().MyShard();
const auto features_path = wd.DistilledFeaturesPath().MyShard();
LOG(INFO) << log_line << VV(env.total_shards) << VV(corpus_path)
<< VV(features_path);

Expand All @@ -59,9 +59,11 @@ void DistillTask(const Environment &env,
const size_t num_total_shards = shard_indices.size();
size_t num_shards_read = 0;
size_t num_distilled_corpus_elements = 0;
const auto corpus_path_info = wd.CorpusPath();
const auto features_path_info = wd.FeaturesPath();
for (size_t shard_idx : shard_indices) {
const std::string corpus_path = wd.CorpusPath(shard_idx);
const std::string features_path = wd.FeaturesPath(shard_idx);
const std::string corpus_path = corpus_path_info.Shard(shard_idx);
const std::string features_path = features_path_info.Shard(shard_idx);
VLOG(2) << log_line << "reading shard " << shard_idx << " from:\n"
<< VV(corpus_path) << "\n"
<< VV(features_path);
Expand Down
11 changes: 5 additions & 6 deletions centipede/distill_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/flags/declare.h"
#include "absl/flags/flag.h"
#include "absl/flags/reflection.h"
#include "absl/log/check.h"
#include "./centipede/blob_file.h"
Expand Down Expand Up @@ -69,8 +68,8 @@ using InputVec = std::vector<ByteArray>;
void WriteToShard(const Environment &env, const TestCorpusRecord &record,
size_t shard_index) {
const WorkDir wd{env};
auto corpus_path = wd.CorpusPath(shard_index);
auto features_path = wd.FeaturesPath(shard_index);
const auto corpus_path = wd.CorpusPath().Shard(shard_index);
const auto features_path = wd.FeaturesPath().Shard(shard_index);
const auto corpus_appender = DefaultBlobFileWriterFactory();
const auto features_appender = DefaultBlobFileWriterFactory();
CHECK_OK(corpus_appender->Open(corpus_path, "a"));
Expand All @@ -83,15 +82,15 @@ void WriteToShard(const Environment &env, const TestCorpusRecord &record,
// Reads and returns the distilled corpus records from the files at
// `wd.DistilledCorpusPath().MyShard()` and
// `wd.DistilledFeaturesPath().MyShard()`.
std::vector<TestCorpusRecord> ReadFromDistilled(const WorkDir &wd) {
auto distilled_corpus_path = wd.DistilledCorpusPath();
auto distilled_features_path = wd.DistilledFeaturesPath();
const auto distilled_corpus_path = wd.DistilledCorpusPath().MyShard();
const auto distilled_features_path = wd.DistilledFeaturesPath().MyShard();

std::vector<TestCorpusRecord> result;
auto shard_reader_callback = [&result](const ByteArray &input,
FeatureVec &features) {
result.push_back({input, features});
};
ReadShard(wd.DistilledCorpusPath(), wd.DistilledFeaturesPath(),
ReadShard(distilled_corpus_path, distilled_features_path,
shard_reader_callback);
return result;
}
Expand Down
58 changes: 34 additions & 24 deletions centipede/workdir.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,29 @@ std::string NormalizeAnnotation(std::string_view annotation) {

} // namespace

//------------------------------------------------------------------------------
// WorkDir::PathInfo

// Builds the common shard-file prefix by appending `rel_prefix` to `base_dir`
// using `std::filesystem::path` joining rules, and stores `my_shard_index` as
// the index that `MyShard()` resolves to.
WorkDir::PathInfo::PathInfo(std::string base_dir, std::string rel_prefix,
                            size_t my_shard_index)
    : prefix_{std::filesystem::path(base_dir).append(rel_prefix).string()},
      my_shard_index_{my_shard_index} {}

// Returns the path of the file for `shard_index`: the stored prefix followed
// by the index, zero-padded to at least `kDigitsInShardIndex` digits.
std::string WorkDir::PathInfo::Shard(size_t shard_index) const {
  const std::string padded_index =
      absl::StrFormat("%0*d", kDigitsInShardIndex, shard_index);
  return absl::StrCat(prefix_, padded_index);
}

std::string WorkDir::PathInfo::MyShard() const {
return Shard(my_shard_index_);
}

// Returns a glob pattern made of the stored prefix followed by `*`, i.e. one
// that matches every shard file sharing this prefix.
std::string WorkDir::PathInfo::Glob() const {
  std::string pattern = prefix_;
  pattern += '*';
  return pattern;
}

//------------------------------------------------------------------------------
// WorkDir

WorkDir::WorkDir( //
std::string workdir, //
std::string binary_name, //
Expand Down Expand Up @@ -78,36 +101,23 @@ std::string WorkDir::BinaryInfoDirPath() const {
return std::filesystem::path(CoverageDirPath()) / "binary-info";
}

std::string WorkDir::CorpusPathPrefix() const {
return std::filesystem::path(workdir_) / "corpus.";
}

std::string WorkDir::CorpusPath(size_t shard_index) const {
return absl::StrCat(
CorpusPathPrefix(),
absl::StrFormat("%0*d", kDigitsInShardIndex, shard_index));
WorkDir::PathInfo WorkDir::CorpusPath() const {
return {workdir_, "corpus.", my_shard_index_};
}

std::string WorkDir::FeaturesPathPrefix() const {
return std::filesystem::path(CoverageDirPath()) / "features.";
WorkDir::PathInfo WorkDir::DistilledCorpusPath() const {
return {workdir_, absl::StrCat("distilled-", binary_name_, "."),
my_shard_index_};
}

std::string WorkDir::FeaturesPath(size_t shard_index) const {
return absl::StrCat(
FeaturesPathPrefix(),
absl::StrFormat("%0*d", kDigitsInShardIndex, shard_index));
}

std::string WorkDir::DistilledCorpusPath() const {
return std::filesystem::path(workdir_) /
absl::StrFormat("distilled-%s.%0*d", binary_name_, kDigitsInShardIndex,
my_shard_index_);
WorkDir::PathInfo WorkDir::FeaturesPath() const {
return {CoverageDirPath(), "features.", my_shard_index_};
}

std::string WorkDir::DistilledFeaturesPath() const {
return std::filesystem::path(CoverageDirPath())
.append(absl::StrFormat("distilled-features-%s.%0*d", binary_name_,
kDigitsInShardIndex, my_shard_index_));
WorkDir::PathInfo WorkDir::DistilledFeaturesPath() const {
return {CoverageDirPath(),
absl::StrCat("distilled-features-", binary_name_, "."),
my_shard_index_};
}

std::string WorkDir::CoverageReportPath(std::string_view annotation) const {
Expand Down
46 changes: 31 additions & 15 deletions centipede/workdir.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,24 @@ class WorkDir {
// pad indices with 0's in output file names so the names are sorted by index.
static constexpr int kDigitsInShardIndex = 6;

// Describes a family of per-shard files that share a common path prefix and
// produces concrete paths (or a glob) for them. Instances can be created only
// by `WorkDir`, because the constructor is private and `WorkDir` is a friend.
class PathInfo {
public:
// Returns the path of the file for `shard_index`: the prefix followed by the
// index, zero-padded to at least `kDigitsInShardIndex` digits.
std::string Shard(size_t shard_index) const;
// Returns the path of the file for `my_shard_index_`, i.e. the shard index
// this object was constructed with.
std::string MyShard() const;
// Returns a glob (the prefix followed by `*`) matching all the shard files
// that share this prefix.
std::string Glob() const;

private:
friend class WorkDir;
// `base_dir` and `rel_prefix` are joined into the common path prefix;
// `my_shard_index` is the index that `MyShard()` resolves to.
PathInfo(std::string base_dir, std::string rel_prefix,
size_t my_shard_index);

// Common path prefix of all the shard files described by this object.
const std::string prefix_;
// Shard index used by `MyShard()`.
const size_t my_shard_index_;
};

// Constructs an object from directly provided field values.
WorkDir( //
std::string workdir, //
Expand All @@ -57,21 +75,14 @@ class WorkDir {
// Returns the path where the BinaryInfo will be serialized within workdir.
std::string BinaryInfoDirPath() const;

// Returns the path for a corpus file by its shard_index.
std::string CorpusPath(size_t shard_index) const;
std::string CorpusPath() const { return CorpusPath(my_shard_index_); }
// Returns the prefix of all corpus shards
std::string CorpusPathPrefix() const;
// Returns the path for the distilled corpus file for my_shard_index.
std::string DistilledCorpusPath() const;

// Returns the path for a features file by its shard_index.
std::string FeaturesPath(size_t shard_index) const;
std::string FeaturesPath() const { return FeaturesPath(my_shard_index_); }
// Returns the prefix of all feature shards
std::string FeaturesPathPrefix() const;
// Returns the path for the distilled features file for my_shard_index.
std::string DistilledFeaturesPath() const;
// Returns the path info for the corpus files.
PathInfo CorpusPath() const;
// Returns the path info for the distilled corpus files.
PathInfo DistilledCorpusPath() const;
// Returns the path info for the features files.
PathInfo FeaturesPath() const;
// Returns the path info for the distilled features files.
PathInfo DistilledFeaturesPath() const;

// Returns the path for the coverage report file for my_shard_index.
// Non-default `annotation` becomes a part of the returned filename.
Expand Down Expand Up @@ -102,6 +113,11 @@ class WorkDir {
std::string RUsageReportPath(std::string_view annotation = "") const;

private:
std::string CorpusPrefix() const;
std::string DistilledCorpusPrefix() const;
std::string FeaturesPrefix() const;
std::string DistilledFeaturesPrefix() const;

// Internal value holders for when the object is constructed from direct
// values rather than an `Environment` object.
std::string workdir_holder_;
Expand Down
20 changes: 14 additions & 6 deletions centipede/workdir_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,22 @@ TEST(WorkDirTest, Main) {
EXPECT_EQ(wd.CrashReproducerDirPath(), "/dir/crashes");
EXPECT_EQ(wd.BinaryInfoDirPath(), "/dir/bin-hash/binary-info");

EXPECT_EQ(wd.CorpusPath(), "/dir/corpus.000003");
EXPECT_EQ(wd.CorpusPath(7), "/dir/corpus.000007");
EXPECT_EQ(wd.DistilledCorpusPath(), "/dir/distilled-bin.000003");
EXPECT_EQ(wd.CorpusPath().MyShard(), "/dir/corpus.000003");
EXPECT_EQ(wd.CorpusPath().Shard(7), "/dir/corpus.000007");
EXPECT_EQ(wd.CorpusPath().Glob(), "/dir/corpus.*");
EXPECT_EQ(wd.DistilledCorpusPath().MyShard(), "/dir/distilled-bin.000003");
EXPECT_EQ(wd.DistilledCorpusPath().Shard(7), "/dir/distilled-bin.000007");
EXPECT_EQ(wd.DistilledCorpusPath().Glob(), "/dir/distilled-bin.*");

EXPECT_EQ(wd.FeaturesPath(), "/dir/bin-hash/features.000003");
EXPECT_EQ(wd.FeaturesPath(7), "/dir/bin-hash/features.000007");
EXPECT_EQ(wd.DistilledFeaturesPath(),
EXPECT_EQ(wd.FeaturesPath().MyShard(), "/dir/bin-hash/features.000003");
EXPECT_EQ(wd.FeaturesPath().Shard(7), "/dir/bin-hash/features.000007");
EXPECT_EQ(wd.FeaturesPath().Glob(), "/dir/bin-hash/features.*");
EXPECT_EQ(wd.DistilledFeaturesPath().MyShard(),
"/dir/bin-hash/distilled-features-bin.000003");
EXPECT_EQ(wd.DistilledFeaturesPath().Shard(7),
"/dir/bin-hash/distilled-features-bin.000007");
EXPECT_EQ(wd.DistilledFeaturesPath().Glob(),
"/dir/bin-hash/distilled-features-bin.*");

EXPECT_EQ(wd.CoverageReportPath(), //
"/dir/coverage-report-bin.000003.txt");
Expand Down

0 comments on commit e700aba

Please sign in to comment.