diff --git a/centipede/BUILD b/centipede/BUILD index 4fbd803d..9896e712 100644 --- a/centipede/BUILD +++ b/centipede/BUILD @@ -212,6 +212,7 @@ cc_library( ":defs", ":feature", ":logging", + ":remote_file", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", diff --git a/centipede/centipede.cc b/centipede/centipede.cc index aa71120c..41186992 100644 --- a/centipede/centipede.cc +++ b/centipede/centipede.cc @@ -48,7 +48,7 @@ #include #include #include -#include // NOLINT +#include #include #include #include @@ -129,9 +129,8 @@ Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks, input_filter_cmd_.StartForkServer(TemporaryLocalDirPath(), "input_filter"); } -void Centipede::SaveCorpusToLocalDir( - const Environment &env, std::string_view save_corpus_to_local_dir) { - const WorkDir wd{env}; +void Centipede::CorpusToFiles(const Environment &env, std::string_view dir) { + WorkDir wd{env}; for (size_t shard = 0; shard < env.total_shards; shard++) { auto reader = DefaultBlobFileReaderFactory(); auto corpus_path = wd.CorpusPath(shard); @@ -140,26 +139,23 @@ void Centipede::SaveCorpusToLocalDir( size_t num_read = 0; while (reader->Read(blob).ok()) { ++num_read; - WriteToLocalHashedFileInDir(save_corpus_to_local_dir, blob); + WriteToRemoteHashedFileInDir(dir, blob); } LOG(INFO) << "Read " << num_read << " from " << corpus_path; } } -void Centipede::ExportCorpusFromLocalDir(const Environment &env, - std::string_view local_dir) { - const WorkDir wd{env}; - // Shard the file paths in `local_dir` based on hashes of filenames. +void Centipede::CorpusFromFiles(const Environment &env, std::string_view dir) { + WorkDir wd{env}; + // Shard the file paths in `dir` based on hashes of filenames. // Such partition is stable: a given file always goes to a specific shard. 
std::vector> sharded_paths(env.total_shards); size_t total_paths = 0; - for (const auto &entry : - std::filesystem::recursive_directory_iterator(local_dir)) { - if (entry.is_regular_file()) { - size_t filename_hash = std::hash{}(entry.path().filename()); - sharded_paths[filename_hash % env.total_shards].push_back(entry.path()); - ++total_paths; - } + for (const std::string &path : RemoteListFilesRecursively(dir)) { + const std::string filename = path.substr(path.find_last_of('/') + 1); + size_t filename_hash = std::hash<std::string>{}(filename); + sharded_paths[filename_hash % env.total_shards].push_back(path); + ++total_paths; } // Iterate over all shards. size_t inputs_added = 0; @@ -184,13 +180,13 @@ void Centipede::ExportCorpusFromLocalDir(const Environment &env, << "Failed to open corpus file: " << corpus_path; ByteArray shard_data; for (const auto &path : sharded_paths[shard]) { - ByteArray input; - ReadFromLocalFile(path, input); + std::string input; + RemoteFileGetContents(path, input); if (input.empty() || existing_hashes.contains(Hash(input))) { ++inputs_ignored; continue; } - CHECK_OK(appender->Write(input)); + CHECK_OK(appender->Write(ByteArray{input.begin(), input.end()})); ++inputs_added; } LOG(INFO) << VV(shard) << VV(inputs_added) << VV(inputs_ignored) diff --git a/centipede/centipede.h b/centipede/centipede.h index dfd02f69..63bbcd36 100644 --- a/centipede/centipede.h +++ b/centipede/centipede.h @@ -59,14 +59,12 @@ class Centipede { void FuzzingLoop(); // Saves the sharded corpus into `dir`, one file per input. - static void SaveCorpusToLocalDir(const Environment &env, - std::string_view dir); + static void CorpusToFiles(const Environment &env, std::string_view dir); // Exports the corpus from `dir` (one file per input) into the sharded corpus. // Reads `dir` recursively. // Ignores inputs that already exist in the shard they need to be added to. // Sharding is stable and depends only on env.total_shards and the file name. 
- static void ExportCorpusFromLocalDir(const Environment &env, - std::string_view dir); + static void CorpusFromFiles(const Environment &env, std::string_view dir); private: // Executes inputs from `input_vec`. diff --git a/centipede/centipede_interface.cc b/centipede/centipede_interface.cc index f187f572..236c2ff5 100644 --- a/centipede/centipede_interface.cc +++ b/centipede/centipede_interface.cc @@ -187,8 +187,8 @@ int CentipedeMain(const Environment &env, CentipedeCallbacksFactory &callbacks_factory) { SetSignalHandlers(env.stop_at); - if (!env.save_corpus_to_local_dir.empty()) { - Centipede::SaveCorpusToLocalDir(env, env.save_corpus_to_local_dir); + if (!env.corpus_to_files.empty()) { + Centipede::CorpusToFiles(env, env.corpus_to_files); return EXIT_SUCCESS; } @@ -201,15 +201,15 @@ int CentipedeMain(const Environment &env, } // Just export the corpus from a local dir and exit. - if (!env.export_corpus_from_local_dir.empty()) { - Centipede::ExportCorpusFromLocalDir(env, env.export_corpus_from_local_dir); + if (!env.corpus_from_files.empty()) { + Centipede::CorpusFromFiles(env, env.corpus_from_files); return EXIT_SUCCESS; } // Export the corpus from a local dir and then fuzz. 
if (!env.corpus_dir.empty()) { for (const auto &corpus_dir : env.corpus_dir) { - Centipede::ExportCorpusFromLocalDir(env, corpus_dir); + Centipede::CorpusFromFiles(env, corpus_dir); } } diff --git a/centipede/environment.h b/centipede/environment.h index 08569397..9d4d16d4 100644 --- a/centipede/environment.h +++ b/centipede/environment.h @@ -80,8 +80,8 @@ struct Environment { bool distill = false; size_t log_features_shards = 0; std::string knobs_file; - std::string save_corpus_to_local_dir; - std::string export_corpus_from_local_dir; + std::string corpus_to_files; + std::string corpus_from_files; std::vector corpus_dir; std::string symbolizer_path = "llvm-symbolizer"; std::string objdump_path = "objdump"; diff --git a/centipede/environment_flags.cc b/centipede/environment_flags.cc index 0a2d879d..212b2ea1 100644 --- a/centipede/environment_flags.cc +++ b/centipede/environment_flags.cc @@ -263,12 +263,10 @@ ABSL_FLAG(bool, print_runner_log, default_env->print_runner_log, ABSL_FLAG(std::string, knobs_file, default_env->knobs_file, "If not empty, knobs will be read from this (possibly remote) file." " The feature is experimental, not yet fully functional."); -ABSL_FLAG(std::string, save_corpus_to_local_dir, - default_env->save_corpus_to_local_dir, - "Save the remote corpus from working to the given directory, one " - "file per corpus."); -ABSL_FLAG(std::string, export_corpus_from_local_dir, - default_env->export_corpus_from_local_dir, - "Export a corpus from a local directory with one file per input into " - "the sharded remote corpus in workdir. Not recursive."); +ABSL_FLAG(std::string, corpus_to_files, default_env->corpus_to_files, + "Save the remote corpus from workdir to the given (possibly remote) " + "directory, one file per input."); +ABSL_FLAG(std::string, corpus_from_files, default_env->corpus_from_files, + "Export a corpus from a (possibly remote) directory, one file per " + "input, into the sharded remote corpus in workdir. 
Recursive."); ABSL_FLAG(std::vector, corpus_dir, default_env->corpus_dir, @@ -470,9 +468,8 @@ Environment CreateEnvironmentFromFlags(const std::vector &argv) { .distill = absl::GetFlag(FLAGS_distill), .log_features_shards = absl::GetFlag(FLAGS_log_features_shards), .knobs_file = absl::GetFlag(FLAGS_knobs_file), - .save_corpus_to_local_dir = absl::GetFlag(FLAGS_save_corpus_to_local_dir), - .export_corpus_from_local_dir = - absl::GetFlag(FLAGS_export_corpus_from_local_dir), + .corpus_to_files = absl::GetFlag(FLAGS_corpus_to_files), + .corpus_from_files = absl::GetFlag(FLAGS_corpus_from_files), .corpus_dir = absl::GetFlag(FLAGS_corpus_dir), .symbolizer_path = absl::GetFlag(FLAGS_symbolizer_path), .objdump_path = absl::GetFlag(FLAGS_objdump_path), diff --git a/centipede/test_fuzzing_util.sh b/centipede/test_fuzzing_util.sh index d797992f..069ec22e 100644 --- a/centipede/test_fuzzing_util.sh +++ b/centipede/test_fuzzing_util.sh @@ -73,7 +73,7 @@ function centipede::test_crashing_target() { # Create a corpus with one crasher and one other input. echo -n "${crash_input}" >"${TMPCORPUS}/${crash_input}" # induces abort in the target. echo -n "${nice_input}" >"${TMPCORPUS}/${nice_input}" # just some input. - ${target} --workdir="${WD}" --export_corpus_from_local_dir="${TMPCORPUS}" + ${target} --workdir="${WD}" --corpus_from_files="${TMPCORPUS}" # Run fuzzing with num_runs=0, i.e. only run the inputs from the corpus. # Expecting a crash to be observed and reported. diff --git a/centipede/testing/centipede_main_test.sh b/centipede/testing/centipede_main_test.sh index 42013c98..bd70d95c 100755 --- a/centipede/testing/centipede_main_test.sh +++ b/centipede/testing/centipede_main_test.sh @@ -75,7 +75,7 @@ test_debug_symbols() { centipede::assert_regex_in_file "EDGE: LLVMFuzzerTestOneInput .*testing/test_fuzz_target.cc" "${LOG}" echo "============ ${FUNC}: add func1/func2-A inputs to the corpus." 
- test_fuzz --workdir="${WD}" --export_corpus_from_local_dir="${TMPCORPUS}" + test_fuzz --workdir="${WD}" --corpus_from_files="${TMPCORPUS}" echo "============ ${FUNC}: run again, append to the same LOG file." # TODO(b/282845630): Passing `--num_runs=1` only to trigger telemetry dumping. @@ -129,7 +129,7 @@ test_dictionary() { echo "foo" >"${TMPCORPUS}"/foo echo "bat" >"${TMPCORPUS}"/binary centipede::ensure_empty_dir "${WD}" - test_fuzz --workdir="${WD}" --export_corpus_from_local_dir "${TMPCORPUS}" + test_fuzz --workdir="${WD}" --corpus_from_files "${TMPCORPUS}" cp "${WD}/corpus.000000" "${DICT}" echo "============ ${FUNC}: testing binary dictionary file" @@ -150,7 +150,7 @@ test_for_each_blob() { echo "FoO" >"${TMPCORPUS}"/a echo "bAr" >"${TMPCORPUS}"/b - test_fuzz --workdir="${WD}" --export_corpus_from_local_dir "${TMPCORPUS}" + test_fuzz --workdir="${WD}" --corpus_from_files "${TMPCORPUS}" echo "============ ${FUNC}: test for_each_blob" test_fuzz --for_each_blob="cat %P" "${WD}"/corpus.000000 | tee "${LOG}" centipede::assert_regex_in_file "Running 'cat %P' on ${WD}/corpus.000000" "${LOG}" diff --git a/centipede/testing/clusterfuzz_format_test.sh b/centipede/testing/clusterfuzz_format_test.sh index 00d970cf..a5ae35c0 100755 --- a/centipede/testing/clusterfuzz_format_test.sh +++ b/centipede/testing/clusterfuzz_format_test.sh @@ -66,7 +66,7 @@ test_crashing_target() { # Create a corpus with one crasher and one other input. cp "$1" "${TMPCORPUS}" # Triggers an error. echo -n "foo" >"${TMPCORPUS}/foo" # Just some input. - abort_test_fuzz --export_corpus_from_local_dir="${TMPCORPUS}" + abort_test_fuzz --corpus_from_files="${TMPCORPUS}" # Run fuzzing with num_runs=0, i.e. only run the inputs from the corpus. # Expecting a crash to be observed and reported. 
diff --git a/centipede/util.cc b/centipede/util.cc index 46d0405d..82461519 100644 --- a/centipede/util.cc +++ b/centipede/util.cc @@ -56,6 +56,7 @@ #include "./centipede/defs.h" #include "./centipede/feature.h" #include "./centipede/logging.h" +#include "./centipede/remote_file.h" namespace centipede { @@ -142,6 +143,13 @@ void WriteToLocalHashedFileInDir(std::string_view dir_path, WriteToLocalFile(file_path, data); } +void WriteToRemoteHashedFileInDir(std::string_view dir_path, + absl::Span data) { + if (dir_path.empty()) return; + std::string file_path = std::filesystem::path(dir_path).append(Hash(data)); + RemoteFileSetContents(file_path, std::string(data.begin(), data.end())); +} + std::string HashOfFileContents(std::string_view file_path) { ByteArray ba; ReadFromLocalFile(file_path, ba); diff --git a/centipede/util.h b/centipede/util.h index c69a1d6c..69b05865 100644 --- a/centipede/util.h +++ b/centipede/util.h @@ -60,6 +60,9 @@ void WriteToLocalFile(std::string_view file_path, const FeatureVec &data); // Writes `data` to `dir_path`/Hash(`data`). Does nothing if `dir_path.empty()`. void WriteToLocalHashedFileInDir(std::string_view dir_path, absl::Span data); +// Same as `WriteToLocalHashedFileInDir` except supports remote files. +void WriteToRemoteHashedFileInDir(std::string_view dir_path, + absl::Span data); // Returns a path string suitable to create a temporary local directory. // Will return the same value every time it is called within one thread, -// but different values for different threads and difference processes. +// but different values for different threads and different processes.