From 7d8f28106c8589681982f084c12b282a9d30e334 Mon Sep 17 00:00:00 2001 From: Sergey Shevchenko Date: Thu, 11 Jan 2024 15:41:00 -0800 Subject: [PATCH] #Centipede Distiller: NFC renaming + additional logging PiperOrigin-RevId: 597672717 --- centipede/BUILD | 1 - centipede/distill.cc | 48 ++++++++++++++++++++----------- centipede/distill_test.cc | 5 ---- centipede/seed_corpus_maker_lib.h | 4 +-- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/centipede/BUILD b/centipede/BUILD index 9c95b3923..6fcf3f0db 100644 --- a/centipede/BUILD +++ b/centipede/BUILD @@ -1293,7 +1293,6 @@ cc_test( ":test_util", ":util", ":workdir", - "@com_google_absl//absl/flags:flag", "@com_google_absl//absl/flags:reflection", "@com_google_absl//absl/log:check", "@com_google_googletest//:gtest_main", diff --git a/centipede/distill.cc b/centipede/distill.cc index 3d8655e3a..faea09791 100644 --- a/centipede/distill.cc +++ b/centipede/distill.cc @@ -38,10 +38,15 @@ namespace centipede { +using CorpusElt = std::pair; +using CorpusEltVec = std::vector; + void DistillTask(const Environment &env, const std::vector &shard_indices) { + const std::string log_line = + absl::StrCat("DISTILL[S.", env.my_shard_index, "]: "); + const WorkDir wd{env}; - std::string log_line = absl::StrCat("DISTILL[S.", env.my_shard_index, "]: "); const auto corpus_path = wd.DistilledCorpusFiles().MyShardPath(); const auto features_path = wd.DistilledFeaturesFiles().MyShardPath(); LOG(INFO) << log_line << VV(env.total_shards) << VV(corpus_path) @@ -56,22 +61,26 @@ void DistillTask(const Environment &env, FeatureSet feature_set(/*frequency_threshold=*/1, env.MakeDomainDiscardMask()); - const size_t num_total_shards = shard_indices.size(); - size_t num_shards_read = 0; - size_t num_distilled_corpus_elements = 0; + const size_t num_shards = shard_indices.size(); + size_t num_read_shards = 0; + size_t num_read_elements = 0; + size_t num_distilled_elements = 0; const auto corpus_files = wd.CorpusFiles(); const auto features_files = wd.FeaturesFiles(); + for (size_t shard_idx : shard_indices) { const std::string corpus_path = corpus_files.ShardPath(shard_idx); const std::string features_path = features_files.ShardPath(shard_idx); - VLOG(2) << log_line << "reading shard " << shard_idx << " from:\n" + + VLOG(2) << log_line << "reading input shard " << shard_idx << ":\n" << VV(corpus_path) << "\n" << VV(features_path); - // Read records from the current shard. - std::vector> records; + + // Read elements from the current shard. + CorpusEltVec shard_elts; ReadShard(corpus_path, features_path, - [&](const ByteArray &input, FeatureVec &input_features) { - records.emplace_back(input, std::move(input_features)); + [&shard_elts](const ByteArray &input, FeatureVec &features) { + shard_elts.emplace_back(input, std::move(features)); }); // Reverse the order of inputs read from the current shard. // The intuition is as follows: @@ -79,22 +88,27 @@ void DistillTask(const Environment &env, // are closer to the end are more interesting, so we start there. // * If the shard resulted from somethening else, the reverse order is not // any better or worse than any other order. - std::reverse(records.begin(), records.end()); - // Iterate the records, add those that have new features. + std::reverse(shard_elts.begin(), shard_elts.end()); + ++num_read_shards; + + // Iterate the elts, add those that have new features. // This is a simple linear greedy set cover algorithm. - for (auto &&[input, features] : records) { + VLOG(1) << log_line << "appending elements from input shard " << shard_idx + << " to output shard"; + for (auto &[input, features] : shard_elts) { + ++num_read_elements; feature_set.PruneDiscardedDomains(features); if (!feature_set.HasUnseenFeatures(features)) continue; feature_set.IncrementFrequencies(features); // Append to the distilled corpus and features files. CHECK_OK(corpus_writer->Write(input)); CHECK_OK(features_writer->Write(PackFeaturesAndHash(input, features))); - num_distilled_corpus_elements++; + ++num_distilled_elements; + VLOG_EVERY_N(10, 1000) << VV(num_distilled_elements); } - num_shards_read++; - LOG(INFO) << log_line << feature_set << " shards: " << num_shards_read - << "/" << num_total_shards - << " corpus: " << num_distilled_corpus_elements; + LOG(INFO) << log_line << feature_set << " src_shards: " << num_read_shards + << "/" << num_shards << " src_elts: " << num_read_elements + << " dist_elts: " << num_distilled_elements; } } diff --git a/centipede/distill_test.cc b/centipede/distill_test.cc index 480071cbb..9c0138c1a 100644 --- a/centipede/distill_test.cc +++ b/centipede/distill_test.cc @@ -23,7 +23,6 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "absl/flags/declare.h" #include "absl/flags/reflection.h" #include "absl/log/check.h" #include "./centipede/blob_file.h" @@ -35,10 +34,6 @@ #include "./centipede/util.h" #include "./centipede/workdir.h" -ABSL_DECLARE_FLAG(std::string, binary_hash); -ABSL_DECLARE_FLAG(std::string, binary); -ABSL_DECLARE_FLAG(std::string, workdir); - namespace centipede { namespace { diff --git a/centipede/seed_corpus_maker_lib.h b/centipede/seed_corpus_maker_lib.h index 357311695..7f6457d9e 100644 --- a/centipede/seed_corpus_maker_lib.h +++ b/centipede/seed_corpus_maker_lib.h @@ -25,8 +25,8 @@ namespace centipede { -using InputAndFeatures = std::pair; -using InputAndFeaturesVec = std::vector; +using CorpusElt = std::pair; +using InputAndFeaturesVec = std::vector; // If a file with `config_spec` path exists, tries to parse it as a // `SeedCorpusConfig` textproto. Otherwise, tries to parse `config_spec` as a