Skip to content

Commit

Permalink
#Centipede: More RAM-efficient ReadShard(), part 1
Browse files: browse the repository at this point in the history
PiperOrigin-RevId: 615963740
  • Loading branch information
ussuri authored and copybara-github committed Mar 15, 2024
1 parent 2e64f5d commit 3d43bec
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 18 deletions.
4 changes: 2 additions & 2 deletions centipede/analyze_corpora.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ std::vector<CorpusRecord> ReadCorpora(std::string_view binary_name,
LOG(INFO) << "Reading corpus at: " << corpus_paths[i];
LOG(INFO) << "Reading features at: " << features_paths[i];
ReadShard(corpus_paths[i], features_paths[i],
[&corpus](const ByteArray &input, FeatureVec &features) {
corpus.push_back({input, features});
[&corpus](ByteArray input, FeatureVec features) {
corpus.emplace_back(std::move(input), std::move(features));
});
}
return corpus;
Expand Down
7 changes: 4 additions & 3 deletions centipede/centipede.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "absl/base/attributes.h"
Expand Down Expand Up @@ -415,12 +416,12 @@ void Centipede::LoadShard(const Environment &load_env, size_t shard_index,
size_t num_added_inputs = 0;
size_t num_skipped_inputs = 0;
std::vector<ByteArray> inputs_to_rerun;
auto input_features_callback = [&](const ByteArray &input,
FeatureVec &input_features) {
auto input_features_callback = [&](ByteArray input,
FeatureVec input_features) {
if (EarlyExitRequested()) return;
if (input_features.empty()) {
if (rerun) {
inputs_to_rerun.push_back(input);
inputs_to_rerun.emplace_back(std::move(input));
}
} else {
LogFeaturesAsSymbols(input_features);
Expand Down
4 changes: 2 additions & 2 deletions centipede/distill.cc
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,8 @@ class InputCorpusShardReader {
// Read elements from the current shard.
centipede::ReadShard( //
corpus_path, features_path,
[&elts](const ByteArray &input, FeatureVec &features) {
elts.emplace_back(input, std::move(features));
[&elts](ByteArray input, FeatureVec features) {
elts.emplace_back(std::move(input), std::move(features));
});
return elts;
}
Expand Down
6 changes: 3 additions & 3 deletions centipede/distill_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <filesystem> // NOLINT
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "gmock/gmock.h"
Expand Down Expand Up @@ -86,9 +87,8 @@ std::vector<TestCorpusRecord> ReadFromDistilled(const WorkDir &wd) {
wd.DistilledFeaturesFiles().MyShardPath();

std::vector<TestCorpusRecord> result;
auto shard_reader_callback = [&result](const ByteArray &input,
FeatureVec &features) {
result.push_back({input, features});
auto shard_reader_callback = [&result](ByteArray input, FeatureVec features) {
result.emplace_back(std::move(input), std::move(features));
};
ReadShard(distilled_corpus_path, distilled_features_path,
shard_reader_callback);
Expand Down
2 changes: 1 addition & 1 deletion centipede/seed_corpus_maker_lib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ void SampleSeedCorpusElementsFromSource( //

ReadShard(corpus_fname, features_fname,
[shard, &shard_elts, &shard_elts_with_features]( //
const ByteArray& input, FeatureVec& features) {
ByteArray input, FeatureVec features) {
// `ReadShard()` indicates "features not computed/found" as
// `{}` and "features computed/found, but empty" as
// `{feature_domains::kNoFeature}`. We're interested in how
Expand Down
7 changes: 3 additions & 4 deletions centipede/shard_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@ namespace centipede {
// params when calling it.
// * When the above is done, stop inserting empty `FeatureVec`s into
// `hash_to_features` when invoking the callback; just pass `{}` instead.
void ReadShard(
std::string_view corpus_path, std::string_view features_path,
const std::function<void(const ByteArray &, FeatureVec &)> &callback) {
void ReadShard(std::string_view corpus_path, std::string_view features_path,
const std::function<void(ByteArray, FeatureVec)> &callback) {
const bool good_corpus_path =
!corpus_path.empty() && RemotePathExists(corpus_path);
const bool good_features_path =
Expand Down Expand Up @@ -95,7 +94,7 @@ void ReadShard(
// a truly empty value into `hash_to_features`, allowing the client to
// discern these two cases.
FeatureVec &features = hash_to_features[Hash(blob)];
callback(input, features);
callback(std::move(input), std::move(features));
}
}

Expand Down
5 changes: 2 additions & 3 deletions centipede/shard_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ namespace centipede {
//
// If features are found for a given input but are empty,
// then callback's 2nd argument is {feature_domains::kNoFeature}.
void ReadShard(
std::string_view corpus_path, std::string_view features_path,
const std::function<void(const ByteArray &, FeatureVec &)> &callback);
void ReadShard(std::string_view corpus_path, std::string_view features_path,
const std::function<void(ByteArray, FeatureVec)> &callback);

} // namespace centipede

Expand Down

0 comments on commit 3d43bec

Please sign in to comment.