Skip to content

Commit

Permalink
#Centipede Rework & expand corpus & features path APIs in WorkDir
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 578024131
  • Loading branch information
ussuri authored and copybara-github committed Oct 31, 2023
1 parent 91fd32a commit 55cf0e9
Show file tree
Hide file tree
Showing 8 changed files with 116 additions and 70 deletions.
2 changes: 1 addition & 1 deletion centipede/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -363,13 +363,13 @@ cc_library(
":defs",
":feature",
":logging",
":pc_info",
":remote_file",
":shard_reader",
":workdir",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings",
],
)

Expand Down
8 changes: 4 additions & 4 deletions centipede/analyze_corpora.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "./centipede/analyze_corpora.h"

#include <algorithm>
#include <cstddef>
#include <string>
#include <string_view>
#include <utility>
Expand All @@ -23,14 +24,14 @@
#include "absl/container/flat_hash_set.h"
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/strings/str_cat.h"
#include "./centipede/binary_info.h"
#include "./centipede/control_flow.h"
#include "./centipede/corpus.h"
#include "./centipede/coverage.h"
#include "./centipede/defs.h"
#include "./centipede/feature.h"
#include "./centipede/logging.h"
#include "./centipede/pc_info.h"
#include "./centipede/remote_file.h"
#include "./centipede/shard_reader.h"
#include "./centipede/workdir.h"
Expand All @@ -45,10 +46,9 @@ std::vector<CorpusRecord> ReadCorpora(std::string_view binary_name,
WorkDir workdir(std::string(workdir_path), std::string(binary_name),
std::string(binary_hash), /*my_shard_index=*/0);
std::vector<std::string> corpus_paths;
RemoteGlobMatch(absl::StrCat(workdir.CorpusPathPrefix(), "*"), corpus_paths);
RemoteGlobMatch(workdir.CorpusFiles().AllShardsGlob(), corpus_paths);
std::vector<std::string> features_paths;
RemoteGlobMatch(absl::StrCat(workdir.FeaturesPathPrefix(), "*"),
features_paths);
RemoteGlobMatch(workdir.FeaturesFiles().AllShardsGlob(), features_paths);

CHECK_EQ(corpus_paths.size(), features_paths.size());
std::vector<CorpusRecord> corpus;
Expand Down
24 changes: 14 additions & 10 deletions centipede/centipede.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,10 @@ Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks,
}

void Centipede::CorpusToFiles(const Environment &env, std::string_view dir) {
WorkDir wd{env};
const auto corpus_files = WorkDir{env}.CorpusFiles();
for (size_t shard = 0; shard < env.total_shards; shard++) {
auto reader = DefaultBlobFileReaderFactory();
auto corpus_path = wd.CorpusPath(shard);
auto corpus_path = corpus_files.ShardPath(shard);
reader->Open(corpus_path).IgnoreError(); // may not exist.
absl::Span<uint8_t> blob;
size_t num_read = 0;
Expand All @@ -160,8 +160,9 @@ void Centipede::CorpusFromFiles(const Environment &env, std::string_view dir) {
// Iterate over all shards.
size_t inputs_added = 0;
size_t inputs_ignored = 0;
const auto corpus_files = WorkDir{env}.CorpusFiles();
for (size_t shard = 0; shard < env.total_shards; shard++) {
const std::string corpus_path = wd.CorpusPath(shard);
const std::string corpus_path = corpus_files.ShardPath(shard);
size_t num_shard_bytes = 0;
// Read the shard (if it exists), collect input hashes from it.
absl::flat_hash_set<std::string> existing_hashes;
Expand Down Expand Up @@ -397,8 +398,8 @@ void Centipede::LoadShard(const Environment &load_env, size_t shard_index,
// See serialize_shard_loads on why we may want to serialize shard loads.
// TODO(kcc): remove serialize_shard_loads when LoadShards() uses less RAM.
const WorkDir wd{load_env};
const std::string corpus_path = wd.CorpusPath(shard_index);
const std::string features_path = wd.FeaturesPath(shard_index);
const std::string corpus_path = wd.CorpusFiles().ShardPath(shard_index);
const std::string features_path = wd.FeaturesFiles().ShardPath(shard_index);
if (env_.serialize_shard_loads) {
ABSL_CONST_INIT static absl::Mutex load_shard_mu{absl::kConstInit};
absl::MutexLock lock(&load_shard_mu);
Expand Down Expand Up @@ -432,7 +433,7 @@ void Centipede::LoadAllShardsInRandomOrder(const Environment &load_env,

void Centipede::Rerun(std::vector<ByteArray> &to_rerun) {
if (to_rerun.empty()) return;
auto features_file_path = wd_.FeaturesPath(env_.my_shard_index);
auto features_file_path = wd_.FeaturesFiles().ShardPath(env_.my_shard_index);
auto features_file = DefaultBlobFileWriterFactory();
CHECK_OK(features_file->Open(features_file_path, "a"));

Expand Down Expand Up @@ -581,7 +582,8 @@ void Centipede::MergeFromOtherCorpus(std::string_view merge_from_dir,
CHECK_GE(new_corpus_size, initial_corpus_size); // Corpus can't shrink here.
if (new_corpus_size > initial_corpus_size) {
auto appender = DefaultBlobFileWriterFactory();
CHECK_OK(appender->Open(wd_.CorpusPath(env_.my_shard_index), "a"));
CHECK_OK(
appender->Open(wd_.CorpusFiles().ShardPath(env_.my_shard_index), "a"));
for (size_t idx = initial_corpus_size; idx < new_corpus_size; ++idx) {
CHECK_OK(appender->Write(corpus_.Get(idx)));
}
Expand All @@ -600,7 +602,7 @@ void Centipede::ReloadAllShardsAndWriteDistilledCorpus() {

// Save the distilled corpus to a file in workdir and possibly to a hashed
// file in the first corpus dir passed in `--corpus_dir`.
const auto distill_to_path = wd_.DistilledCorpusPath();
const auto distill_to_path = wd_.DistilledCorpusFiles().MyShardPath();
LOG(INFO) << "Distilling: shard: " << env_.my_shard_index
<< " output: " << distill_to_path << " "
<< " distilled size: " << corpus_.NumActive();
Expand Down Expand Up @@ -660,10 +662,12 @@ void Centipede::FuzzingLoop() {
MergeFromOtherCorpus(env_.merge_from, env_.my_shard_index);
}

auto corpus_path = wd_.CorpusFiles().ShardPath(env_.my_shard_index);
auto corpus_file = DefaultBlobFileWriterFactory();
CHECK_OK(corpus_file->Open(corpus_path, "a"));
auto features_path = wd_.FeaturesFiles().ShardPath(env_.my_shard_index);
auto features_file = DefaultBlobFileWriterFactory();
CHECK_OK(corpus_file->Open(wd_.CorpusPath(env_.my_shard_index), "a"));
CHECK_OK(features_file->Open(wd_.FeaturesPath(env_.my_shard_index), "a"));
CHECK_OK(features_file->Open(features_path, "a"));

// Load seed corpus when there is no external corpus loaded.
if (corpus_.NumTotal() == 0) LoadSeedInputs();
Expand Down
10 changes: 6 additions & 4 deletions centipede/distill.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ void DistillTask(const Environment &env,
const std::vector<size_t> &shard_indices) {
const WorkDir wd{env};
std::string log_line = absl::StrCat("DISTILL[S.", env.my_shard_index, "]: ");
const auto corpus_path = wd.DistilledCorpusPath();
const auto features_path = wd.DistilledFeaturesPath();
const auto corpus_path = wd.DistilledCorpusFiles().MyShardPath();
const auto features_path = wd.DistilledFeaturesFiles().MyShardPath();
LOG(INFO) << log_line << VV(env.total_shards) << VV(corpus_path)
<< VV(features_path);

Expand All @@ -59,9 +59,11 @@ void DistillTask(const Environment &env,
const size_t num_total_shards = shard_indices.size();
size_t num_shards_read = 0;
size_t num_distilled_corpus_elements = 0;
const auto corpus_files = wd.CorpusFiles();
const auto features_files = wd.FeaturesFiles();
for (size_t shard_idx : shard_indices) {
const std::string corpus_path = wd.CorpusPath(shard_idx);
const std::string features_path = wd.FeaturesPath(shard_idx);
const std::string corpus_path = corpus_files.ShardPath(shard_idx);
const std::string features_path = features_files.ShardPath(shard_idx);
VLOG(2) << log_line << "reading shard " << shard_idx << " from:\n"
<< VV(corpus_path) << "\n"
<< VV(features_path);
Expand Down
12 changes: 6 additions & 6 deletions centipede/distill_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/flags/declare.h"
#include "absl/flags/flag.h"
#include "absl/flags/reflection.h"
#include "absl/log/check.h"
#include "./centipede/blob_file.h"
Expand Down Expand Up @@ -69,8 +68,8 @@ using InputVec = std::vector<ByteArray>;
void WriteToShard(const Environment &env, const TestCorpusRecord &record,
size_t shard_index) {
const WorkDir wd{env};
auto corpus_path = wd.CorpusPath(shard_index);
auto features_path = wd.FeaturesPath(shard_index);
const auto corpus_path = wd.CorpusFiles().ShardPath(shard_index);
const auto features_path = wd.FeaturesFiles().ShardPath(shard_index);
const auto corpus_appender = DefaultBlobFileWriterFactory();
const auto features_appender = DefaultBlobFileWriterFactory();
CHECK_OK(corpus_appender->Open(corpus_path, "a"));
Expand All @@ -83,15 +82,16 @@ void WriteToShard(const Environment &env, const TestCorpusRecord &record,
// Reads and returns the distilled corpus records from
// `wd.DistilledCorpusFiles().MyShardPath()` and
// `wd.DistilledFeaturesFiles().MyShardPath()`.
std::vector<TestCorpusRecord> ReadFromDistilled(const WorkDir &wd) {
auto distilled_corpus_path = wd.DistilledCorpusPath();
auto distilled_features_path = wd.DistilledFeaturesPath();
const auto distilled_corpus_path = wd.DistilledCorpusFiles().MyShardPath();
const auto distilled_features_path =
wd.DistilledFeaturesFiles().MyShardPath();

std::vector<TestCorpusRecord> result;
auto shard_reader_callback = [&result](const ByteArray &input,
FeatureVec &features) {
result.push_back({input, features});
};
ReadShard(wd.DistilledCorpusPath(), wd.DistilledFeaturesPath(),
ReadShard(distilled_corpus_path, distilled_features_path,
shard_reader_callback);
return result;
}
Expand Down
59 changes: 35 additions & 24 deletions centipede/workdir.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,30 @@ std::string NormalizeAnnotation(std::string_view annotation) {

} // namespace

//------------------------------------------------------------------------------
// WorkDir::ShardedFileInfo

WorkDir::ShardedFileInfo::ShardedFileInfo(std::string_view base_dir,
std::string_view rel_prefix,
size_t my_shard_index)
: prefix_{std::filesystem::path(base_dir) / rel_prefix},
my_shard_index_{my_shard_index} {}

// Returns the path of the shard file for `shard_index`: the shared prefix
// followed by the index zero-padded to `kDigitsInShardIndex` digits.
std::string WorkDir::ShardedFileInfo::ShardPath(size_t shard_index) const {
  std::string shard_path =
      absl::StrFormat("%s%0*d", prefix_, kDigitsInShardIndex, shard_index);
  return shard_path;
}

// Returns the path of the shard file for the shard index this object was
// constructed with.
std::string WorkDir::ShardedFileInfo::MyShardPath() const {
  const size_t own_index = my_shard_index_;
  return ShardPath(own_index);
}

// Returns a glob pattern matching every shard file of this category: the
// shared prefix followed by a `*` wildcard.
std::string WorkDir::ShardedFileInfo::AllShardsGlob() const {
  return prefix_ + "*";
}

//------------------------------------------------------------------------------
// WorkDir

WorkDir::WorkDir( //
std::string workdir, //
std::string binary_name, //
Expand Down Expand Up @@ -78,36 +102,23 @@ std::string WorkDir::BinaryInfoDirPath() const {
return std::filesystem::path(CoverageDirPath()) / "binary-info";
}

std::string WorkDir::CorpusPathPrefix() const {
return std::filesystem::path(workdir_) / "corpus.";
}

std::string WorkDir::CorpusPath(size_t shard_index) const {
return absl::StrCat(
CorpusPathPrefix(),
absl::StrFormat("%0*d", kDigitsInShardIndex, shard_index));
WorkDir::ShardedFileInfo WorkDir::CorpusFiles() const {
return {workdir_, "corpus.", my_shard_index_};
}

std::string WorkDir::FeaturesPathPrefix() const {
return std::filesystem::path(CoverageDirPath()) / "features.";
WorkDir::ShardedFileInfo WorkDir::DistilledCorpusFiles() const {
return {workdir_, absl::StrCat("distilled-", binary_name_, "."),
my_shard_index_};
}

std::string WorkDir::FeaturesPath(size_t shard_index) const {
return absl::StrCat(
FeaturesPathPrefix(),
absl::StrFormat("%0*d", kDigitsInShardIndex, shard_index));
}

std::string WorkDir::DistilledCorpusPath() const {
return std::filesystem::path(workdir_) /
absl::StrFormat("distilled-%s.%0*d", binary_name_, kDigitsInShardIndex,
my_shard_index_);
WorkDir::ShardedFileInfo WorkDir::FeaturesFiles() const {
return {CoverageDirPath(), "features.", my_shard_index_};
}

std::string WorkDir::DistilledFeaturesPath() const {
return std::filesystem::path(CoverageDirPath())
.append(absl::StrFormat("distilled-features-%s.%0*d", binary_name_,
kDigitsInShardIndex, my_shard_index_));
WorkDir::ShardedFileInfo WorkDir::DistilledFeaturesFiles() const {
return {CoverageDirPath(),
absl::StrCat("distilled-features-", binary_name_, "."),
my_shard_index_};
}

std::string WorkDir::CoverageReportPath(std::string_view annotation) const {
Expand Down
43 changes: 28 additions & 15 deletions centipede/workdir.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,26 @@ class WorkDir {
// pad indices with 0's in output file names so the names are sorted by index.
static constexpr int kDigitsInShardIndex = 6;

// Provides APIs for getting paths of a particular category of sharded files.
// All paths of a category are derived from a single stored prefix. The
// constructor is private: objects are created by `WorkDir` (a friend) and
// handed out by value.
class ShardedFileInfo {
 public:
  // Returns the path of the shard file for `shard_index`.
  std::string ShardPath(size_t shard_index) const;
  // Returns the path of the shard file for `my_shard_index_`.
  std::string MyShardPath() const;
  // Returns a glob matching all the shard files.
  std::string AllShardsGlob() const;

 private:
  friend class WorkDir;

  // Joins `base_dir` and `rel_prefix` into the stored path prefix and records
  // `my_shard_index` for use by `MyShardPath()`.
  ShardedFileInfo(std::string_view base_dir, std::string_view rel_prefix,
                  size_t my_shard_index);

  // Intentionally not `const`: `const` data members would implicitly delete
  // copy/move assignment for this value type. The state is still effectively
  // immutable because it is private and every public method is `const`.
  std::string prefix_;
  size_t my_shard_index_;
};

// Constructs an object from directly provided field values.
WorkDir( //
std::string workdir, //
Expand All @@ -57,21 +77,14 @@ class WorkDir {
// Returns the path where the BinaryInfo will be serialized within workdir.
std::string BinaryInfoDirPath() const;

// Returns the path for a corpus file by its shard_index.
std::string CorpusPath(size_t shard_index) const;
std::string CorpusPath() const { return CorpusPath(my_shard_index_); }
// Returns the prefix of all corpus shards
std::string CorpusPathPrefix() const;
// Returns the path for the distilled corpus file for my_shard_index.
std::string DistilledCorpusPath() const;

// Returns the path for a features file by its shard_index.
std::string FeaturesPath(size_t shard_index) const;
std::string FeaturesPath() const { return FeaturesPath(my_shard_index_); }
// Returns the prefix of all feature shards
std::string FeaturesPathPrefix() const;
// Returns the path for the distilled features file for my_shard_index.
std::string DistilledFeaturesPath() const;
// Returns the path info for the corpus files.
ShardedFileInfo CorpusFiles() const;
// Returns the path info for the distilled corpus files.
ShardedFileInfo DistilledCorpusFiles() const;
// Returns the path info for the features files.
ShardedFileInfo FeaturesFiles() const;
// Returns the path info for the distilled features files.
ShardedFileInfo DistilledFeaturesFiles() const;

// Returns the path for the coverage report file for my_shard_index.
// Non-default `annotation` becomes a part of the returned filename.
Expand Down
28 changes: 22 additions & 6 deletions centipede/workdir_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,30 @@ TEST(WorkDirTest, Main) {
EXPECT_EQ(wd.CrashReproducerDirPath(), "/dir/crashes");
EXPECT_EQ(wd.BinaryInfoDirPath(), "/dir/bin-hash/binary-info");

EXPECT_EQ(wd.CorpusPath(), "/dir/corpus.000003");
EXPECT_EQ(wd.CorpusPath(7), "/dir/corpus.000007");
EXPECT_EQ(wd.DistilledCorpusPath(), "/dir/distilled-bin.000003");
EXPECT_EQ(wd.CorpusFiles().MyShardPath(), "/dir/corpus.000003");
EXPECT_EQ(wd.CorpusFiles().ShardPath(7), "/dir/corpus.000007");
EXPECT_EQ(wd.CorpusFiles().AllShardsGlob(), "/dir/corpus.*");

EXPECT_EQ(wd.FeaturesPath(), "/dir/bin-hash/features.000003");
EXPECT_EQ(wd.FeaturesPath(7), "/dir/bin-hash/features.000007");
EXPECT_EQ(wd.DistilledFeaturesPath(),
EXPECT_EQ(wd.DistilledCorpusFiles().MyShardPath(), //
"/dir/distilled-bin.000003");
EXPECT_EQ(wd.DistilledCorpusFiles().ShardPath(7), //
"/dir/distilled-bin.000007");
EXPECT_EQ(wd.DistilledCorpusFiles().AllShardsGlob(), //
"/dir/distilled-bin.*");

EXPECT_EQ(wd.FeaturesFiles().MyShardPath(), //
"/dir/bin-hash/features.000003");
EXPECT_EQ(wd.FeaturesFiles().ShardPath(7), //
"/dir/bin-hash/features.000007");
EXPECT_EQ(wd.FeaturesFiles().AllShardsGlob(), //
"/dir/bin-hash/features.*");

EXPECT_EQ(wd.DistilledFeaturesFiles().MyShardPath(), //
"/dir/bin-hash/distilled-features-bin.000003");
EXPECT_EQ(wd.DistilledFeaturesFiles().ShardPath(7), //
"/dir/bin-hash/distilled-features-bin.000007");
EXPECT_EQ(wd.DistilledFeaturesFiles().AllShardsGlob(), //
"/dir/bin-hash/distilled-features-bin.*");

EXPECT_EQ(wd.CoverageReportPath(), //
"/dir/coverage-report-bin.000003.txt");
Expand Down

0 comments on commit 55cf0e9

Please sign in to comment.