Skip to content

Commit

Permalink
#Centipede Amend seeder to also copy features for target binary from …
Browse files Browse the repository at this point in the history
…seeding sources, if available

PiperOrigin-RevId: 579340810
  • Loading branch information
ussuri authored and copybara-github committed Nov 3, 2023
1 parent afc0b4f commit 73ea91b
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 59 deletions.
7 changes: 5 additions & 2 deletions centipede/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ cc_binary(
":config_init",
":seed_corpus_maker_flags",
":seed_corpus_maker_lib",
":util",
"@com_google_absl//absl/flags:flag",
],
)
Expand Down Expand Up @@ -998,16 +999,18 @@ cc_library(
deps = [
":blob_file",
":defs",
":feature",
":logging",
":remote_file",
":seed_corpus_config_cc_proto",
":shard_reader",
":util",
":workdir",
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/random",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/types:span",
"@com_google_protobuf//:protobuf",
],
)
Expand Down
14 changes: 13 additions & 1 deletion centipede/seed_corpus_maker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,29 @@
// limitations under the License.

#include <cstdlib>
#include <filesystem> // NOLINT
#include <string>

#include "absl/flags/flag.h"
#include "./centipede/config_init.h"
#include "./centipede/seed_corpus_maker_flags.h"
#include "./centipede/seed_corpus_maker_lib.h"
#include "./centipede/util.h"

// Entry point of the seed corpus maker tool.
//
// Reads the --config, --coverage_binary_path, --coverage_binary_hash and
// --override_out_dir flags, derives the coverage binary's name and hash from
// them, and passes everything to `centipede::GenerateSeedCorpusFromConfig()`.
// Returns EXIT_SUCCESS once that call returns.
int main(int argc, char** argv) {
  // The result of InitRuntime() is intentionally discarded.
  (void)centipede::config::InitRuntime(argc, argv);

  const std::string config = absl::GetFlag(FLAGS_config);
  const std::string binary_path = absl::GetFlag(FLAGS_coverage_binary_path);

  // An explicitly passed hash takes precedence. Otherwise, hash the contents
  // of the binary, but only if a path was supplied at all.
  // NOTE(review): if --coverage_binary_path is a bare basename that does not
  // exist on disk and no hash was passed, the result depends on how
  // HashOfFileContents() treats an unreadable file - confirm.
  std::string binary_hash = absl::GetFlag(FLAGS_coverage_binary_hash);
  if (binary_hash.empty() && !binary_path.empty()) {
    binary_hash = centipede::HashOfFileContents(binary_path);
  }

  // Only the last path component is passed on as the binary name. The
  // explicit `.string()` avoids relying on the implicit conversion from
  // `std::filesystem::path`, whose native string type is platform-dependent.
  const std::string binary_name =
      std::filesystem::path{binary_path}.filename().string();
  const std::string override_out_dir = absl::GetFlag(FLAGS_override_out_dir);

  centipede::GenerateSeedCorpusFromConfig(  //
      config, binary_name, binary_hash, override_out_dir);

  return EXIT_SUCCESS;
}
10 changes: 10 additions & 0 deletions centipede/seed_corpus_maker_flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ ABSL_FLAG(
"parent dir, if --config is a filename, or the current dir otherwise.\n"
"Furthermore, `destination.dir_path` can be overridden by passing a "
"non-empty --out_dir.");
// Location of the binary from which coverage is to be collected.
ABSL_FLAG(std::string, coverage_binary_path, "",
          "The path of the binary from which coverage is to be collected. "
          "Can be just the basename of the binary, but in that case "
          "--coverage_binary_hash must also be provided.");
// Optional replacement for the hash of --coverage_binary_path's contents.
ABSL_FLAG(std::string, coverage_binary_hash, "",
          "If not-empty, this hash is used instead of the actual hash of "
          "the contents of --coverage_binary_path. Use when the binary "
          "pointed at by --coverage_binary_path is not actually available "
          "on disk.");
ABSL_FLAG(
std::string, override_out_dir, "",
"If non-empty, overrides the `destination.dir_path` field in the resolved "
Expand Down
2 changes: 2 additions & 0 deletions centipede/seed_corpus_maker_flags.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include "absl/flags/declare.h"

// Declarations of the seed corpus maker's command line flags, so that other
// translation units (e.g. the tool's `main()`) can read them via
// `absl::GetFlag()`.

// The seed corpus config specification.
ABSL_DECLARE_FLAG(std::string, config);
// Path (or just the basename) of the binary from which coverage is collected.
ABSL_DECLARE_FLAG(std::string, coverage_binary_path);
// If non-empty, used instead of the hash of --coverage_binary_path's contents.
ABSL_DECLARE_FLAG(std::string, coverage_binary_hash);
// If non-empty, overrides the config's `destination.dir_path`.
ABSL_DECLARE_FLAG(std::string, override_out_dir);

#endif // THIRD_PARTY_CENTIPEDE_SEED_CORPUS_MAKER_FLAGS_H_
170 changes: 121 additions & 49 deletions centipede/seed_corpus_maker_lib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <filesystem> // NOLINT
Expand All @@ -29,21 +28,24 @@
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/random/random.h"
#include "absl/status/status.h"
#include "absl/strings/match.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_replace.h"
#include "absl/types/span.h"
#include "./centipede/blob_file.h"
#include "./centipede/defs.h"
#include "./centipede/feature.h"
#include "./centipede/logging.h"
#include "./centipede/remote_file.h"
#include "./centipede/seed_corpus_config.pb.h"
#include "./centipede/shard_reader.h"
#include "./centipede/util.h"
#include "./centipede/workdir.h"
#include "google/protobuf/text_format.h"

// TODO(ussuri): Add unit tests.
Expand All @@ -59,6 +61,9 @@ namespace centipede {

namespace fs = std::filesystem;

// A single seed corpus element: the input bytes (`first`) paired with the
// features recorded for that input (`second`). The features vector may be
// empty when no features are available for the input.
using InputAndFeatures = std::pair<ByteArray, FeatureVec>;
// A collection of seed corpus elements, as sampled from the sources and
// written to the destination.
using InputAndFeaturesVec = std::vector<InputAndFeatures>;

SeedCorpusConfig ResolveSeedCorpusConfig( //
std::string_view config_spec, //
std::string_view override_out_dir) {
Expand Down Expand Up @@ -115,9 +120,11 @@ SeedCorpusConfig ResolveSeedCorpusConfig( //
return config;
}

void SampleSeedCorpusElementsFromSource( //
const SeedCorpusSource& source, //
std::vector<centipede::ByteArray>& elements) {
void SampleSeedCorpusElementsFromSource( //
const SeedCorpusSource& source, //
std::string_view coverage_binary_name, //
std::string_view coverage_binary_hash, //
InputAndFeaturesVec& elements) {
LOG(INFO) << "Reading/sampling seed corpus elements from source:\n"
<< source.DebugString();

Expand Down Expand Up @@ -157,35 +164,39 @@ void SampleSeedCorpusElementsFromSource( //

// Read all the elements from the found corpus shard files.

std::vector<centipede::ByteArray> src_elts;
InputAndFeaturesVec src_elts;
size_t num_non_empty_features = 0;

for (const auto& corpus_fname : corpus_fnames) {
std::unique_ptr<centipede::BlobFileReader> corpus_reader =
centipede::DefaultBlobFileReaderFactory();
CHECK(corpus_reader != nullptr);
CHECK_OK(corpus_reader->Open(corpus_fname)) << VV(corpus_fname);

absl::Status read_status;
size_t num_read_elts = 0;
while (true) {
absl::Span<uint8_t> elt;
read_status = corpus_reader->Read(elt);
// Reached EOF - done with this shard.
if (absl::IsOutOfRange(read_status)) break;
CHECK_OK(read_status)
<< "Failure reading elements from shard " << corpus_fname;
CHECK(!elt.empty()) << "Read empty element: " << VV(corpus_fname);
src_elts.emplace_back(elt.begin(), elt.end());
++num_read_elts;
}

corpus_reader->Close().IgnoreError();

LOG(INFO) << "Read " << num_read_elts << " elements from shard "
<< corpus_fname;
// NOTE: The deduced matching `features_fname` may not exist if the source
// corpus was generated for a coverage binary that is different from the one
// we need, but `ReadShard()` can tolerate that, passing empty `FeatureVec`s
// to the callback if that's the case.
const auto work_dir = WorkDir::FromCorpusShardPath( //
corpus_fname, coverage_binary_name, coverage_binary_hash);
const std::string features_fname =
work_dir.CorpusFiles().IsShardPath(corpus_fname)
? work_dir.FeaturesFiles().MyShardPath()
: work_dir.DistilledCorpusFiles().IsShardPath(corpus_fname)
? work_dir.DistilledFeaturesFiles().MyShardPath()
: "";
size_t prev_src_elts_size = src_elts.size();
size_t prev_num_non_empty_features = num_non_empty_features;
ReadShard(corpus_fname, features_fname,
[&src_elts, &num_non_empty_features](const ByteArray& input,
FeatureVec& features) {
num_non_empty_features += features.empty() ? 0 : 1;
src_elts.emplace_back(input, std::move(features));
});
LOG(INFO) << "Read " << (src_elts.size() - prev_src_elts_size)
<< " elements with "
<< (num_non_empty_features - prev_num_non_empty_features)
<< " non-empty features from source shard:\n"
<< VV(corpus_fname) << "\n"
<< VV(features_fname);
}

LOG(INFO) << "Read " << src_elts.size() << " elements total from source "
LOG(INFO) << "Read total of " << src_elts.size() << " elements with "
<< num_non_empty_features << " non-empty features from source "
<< source.dir_glob();

// Extract a sample of the elements of the size specified in
Expand Down Expand Up @@ -219,8 +230,10 @@ void SampleSeedCorpusElementsFromSource( //
}
}

void WriteSeedCorpusElementsToDestination( //
const std::vector<centipede::ByteArray>& elements, //
void WriteSeedCorpusElementsToDestination( //
const InputAndFeaturesVec& elements, //
std::string_view coverage_binary_name, //
std::string_view coverage_binary_hash, //
const SeedCorpusDestination& destination) {
LOG(INFO) << "Writing seed corpus elements to destination:\n"
<< destination.DebugString();
Expand Down Expand Up @@ -250,34 +263,89 @@ void WriteSeedCorpusElementsToDestination( //
auto elt_it = elements.cbegin();
for (size_t s = 0; s < shard_sizes.size(); ++s) {
// Generate the output shard's filename.
// TODO(ussuri): Use more of `WorkDir` APIs here (possibly extend them,
// and possibly retire `SeedCorpusDestination::shard_index_digits`).
const std::string shard_idx =
absl::StrFormat("%0*d", shard_index_digits, s);
const std::string shard_rel_fname =
const std::string corpus_rel_fname =
absl::StrReplaceAll(destination.shard_rel_glob(), {{"*", shard_idx}});
const std::string shard_fname =
fs::path{destination.dir_path()} / shard_rel_fname;
const std::string corpus_fname =
fs::path{destination.dir_path()} / corpus_rel_fname;

const auto work_dir = WorkDir::FromCorpusShardPath( //
corpus_fname, coverage_binary_name, coverage_binary_hash);

CHECK(corpus_fname == work_dir.CorpusFiles().MyShardPath() ||
corpus_fname == work_dir.DistilledCorpusFiles().MyShardPath())
<< "Bad config: generated destination corpus filename '" << corpus_fname
<< "' doesn't match one of two expected forms '"
<< work_dir.CorpusFiles().MyShardPath() << "' or '"
<< work_dir.DistilledCorpusFiles().MyShardPath()
<< "'; make sure binary name in config matches explicitly passed '"
<< coverage_binary_name << "'";

const std::string features_fname =
work_dir.CorpusFiles().IsShardPath(corpus_fname)
? work_dir.FeaturesFiles().MyShardPath()
: work_dir.DistilledFeaturesFiles().MyShardPath();
CHECK(!features_fname.empty());

LOG(INFO) << "Writing " << shard_sizes[s]
<< " elements to destination shard:\n"
<< VV(corpus_fname) << "\n"
<< VV(features_fname);

// Features files are always saved in a subdir of the workdir
// (== `destination.dir_path()` here), which might not exist yet, so we
// create it. Corpus files are saved directly in the workdir, but we create
// their parent dir too, in case `destination.shard_rel_glob()` contains
// directory components (not the intended use, but the end-user may do that).
if (!corpus_fname.empty()) {
RemoteMkdir(fs::path{corpus_fname}.parent_path().string());
}
if (!features_fname.empty()) {
RemoteMkdir(fs::path{features_fname}.parent_path().string());
}

// Create writers for the corpus and features shard files.

LOG(INFO) << "Writing " << shard_sizes[s] << " elements to " << shard_fname;
// TODO(ussuri): 1. Once the whole thing is a class, make
// `num_non_empty_features` a member and don't even create a features file
// if 0. 2. Wrap corpus/features writing in a similar API to `ReadShard()`.

// Open the shard's file.
std::unique_ptr<centipede::BlobFileWriter> corpus_writer =
const std::unique_ptr<centipede::BlobFileWriter> corpus_writer =
centipede::DefaultBlobFileWriterFactory();
CHECK(corpus_writer != nullptr);
CHECK_OK(corpus_writer->Open(shard_fname, "w")) << VV(shard_fname);
CHECK_OK(corpus_writer->Open(corpus_fname, "w")) << VV(corpus_fname);

const std::unique_ptr<centipede::BlobFileWriter> features_writer =
DefaultBlobFileWriterFactory();
CHECK(features_writer != nullptr);
CHECK_OK(features_writer->Open(features_fname, "w")) << VV(features_fname);

// Write the shard's elements to the corpus and features shard files.

// Write the shard's elements to the file.
for (size_t e = 0, ee = shard_sizes[s]; e < ee; ++e) {
CHECK(elt_it != elements.cend());
CHECK_OK(corpus_writer->Write(*elt_it)) << VV(shard_fname);
const ByteArray& input = elt_it->first;
CHECK_OK(corpus_writer->Write(input)) << VV(corpus_fname);
const FeatureVec& features = elt_it->second;
if (!features.empty()) {
const ByteArray packed_features = PackFeaturesAndHash(input, features);
CHECK_OK(features_writer->Write(packed_features)) << VV(features_fname);
}
++elt_it;
}

CHECK_OK(corpus_writer->Close()) << VV(shard_fname);
CHECK_OK(corpus_writer->Close()) << VV(corpus_fname);
CHECK_OK(features_writer->Close()) << VV(features_fname);
}
}

void GenerateSeedCorpusFromConfig( //
std::string_view config_spec, //
void GenerateSeedCorpusFromConfig( //
std::string_view config_spec, //
std::string_view coverage_binary_name, //
std::string_view coverage_binary_hash, //
std::string_view override_out_dir) {
const SeedCorpusConfig config =
ResolveSeedCorpusConfig(config_spec, override_out_dir);
Expand All @@ -290,14 +358,18 @@ void GenerateSeedCorpusFromConfig( //
// Pre-create the destination dir early to catch possible misspellings etc.
RemoteMkdir(config.destination().dir_path());

std::vector<centipede::ByteArray> elements;
InputAndFeaturesVec elements;

for (const auto& source : config.sources()) {
SampleSeedCorpusElementsFromSource(source, elements);
SampleSeedCorpusElementsFromSource( //
source, coverage_binary_name, coverage_binary_hash, elements);
}
LOG(INFO) << "Sampled " << elements.size() << " elements from "
<< config.sources_size() << " seed corpus source(s)";

WriteSeedCorpusElementsToDestination(elements, config.destination());
WriteSeedCorpusElementsToDestination( //
elements, coverage_binary_name, coverage_binary_hash,
config.destination());
LOG(INFO) << "Wrote " << elements.size()
<< " elements to seed corpus destination";
}
Expand Down
Loading

0 comments on commit 73ea91b

Please sign in to comment.