From b6dbff09fb7d5a2abbedc07540cf66eb826d181b Mon Sep 17 00:00:00 2001 From: Markus Kusano Date: Mon, 30 Oct 2023 13:10:51 -0700 Subject: [PATCH] #Centipede Update ExportCorpusFromLocalDir and SaveCorpusToLocalDir to support remote files. The names no longer make sense since the directories are no longer local. These functions are also associated with similarly named flags. I've renamed everything for consistency. PiperOrigin-RevId: 577933102 --- centipede/BUILD | 1 + centipede/centipede.cc | 34 +++++++++----------- centipede/centipede.h | 6 ++-- centipede/centipede_interface.cc | 10 +++--- centipede/environment.h | 4 +-- centipede/environment_flags.cc | 11 +++---- centipede/test_fuzzing_util.sh | 2 +- centipede/testing/centipede_main_test.sh | 6 ++-- centipede/testing/clusterfuzz_format_test.sh | 2 +- centipede/util.cc | 8 +++++ centipede/util.h | 3 ++ 11 files changed, 45 insertions(+), 42 deletions(-) diff --git a/centipede/BUILD b/centipede/BUILD index 4fbd803d..9896e712 100644 --- a/centipede/BUILD +++ b/centipede/BUILD @@ -212,6 +212,7 @@ cc_library( ":defs", ":feature", ":logging", + ":remote_file", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", diff --git a/centipede/centipede.cc b/centipede/centipede.cc index aa71120c..41186992 100644 --- a/centipede/centipede.cc +++ b/centipede/centipede.cc @@ -48,7 +48,7 @@ #include #include #include -#include // NOLINT +#include #include #include #include @@ -129,9 +129,8 @@ Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks, input_filter_cmd_.StartForkServer(TemporaryLocalDirPath(), "input_filter"); } -void Centipede::SaveCorpusToLocalDir( - const Environment &env, std::string_view save_corpus_to_local_dir) { - const WorkDir wd{env}; +void Centipede::CorpusToFiles(const Environment &env, std::string_view dir) { + WorkDir wd{env}; for (size_t shard = 0; shard < env.total_shards; shard++) { auto reader = 
DefaultBlobFileReaderFactory(); auto corpus_path = wd.CorpusPath(shard); @@ -140,26 +139,23 @@ void Centipede::SaveCorpusToLocalDir( size_t num_read = 0; while (reader->Read(blob).ok()) { ++num_read; - WriteToLocalHashedFileInDir(save_corpus_to_local_dir, blob); + WriteToRemoteHashedFileInDir(dir, blob); } LOG(INFO) << "Read " << num_read << " from " << corpus_path; } } -void Centipede::ExportCorpusFromLocalDir(const Environment &env, - std::string_view local_dir) { - const WorkDir wd{env}; - // Shard the file paths in `local_dir` based on hashes of filenames. +void Centipede::CorpusFromFiles(const Environment &env, std::string_view dir) { + WorkDir wd{env}; + // Shard the file paths in `dir` based on hashes of filenames. // Such partition is stable: a given file always goes to a specific shard. std::vector> sharded_paths(env.total_shards); + std::vector paths; size_t total_paths = 0; - for (const auto &entry : - std::filesystem::recursive_directory_iterator(local_dir)) { - if (entry.is_regular_file()) { - size_t filename_hash = std::hash{}(entry.path().filename()); - sharded_paths[filename_hash % env.total_shards].push_back(entry.path()); - ++total_paths; - } + for (const std::string &path : RemoteListFilesRecursively(dir)) { + size_t filename_hash = std::hash{}(path); + sharded_paths[filename_hash % env.total_shards].push_back(path); + ++total_paths; } // Iterate over all shards. 
size_t inputs_added = 0; @@ -184,13 +180,13 @@ void Centipede::ExportCorpusFromLocalDir(const Environment &env, << "Failed to open corpus file: " << corpus_path; ByteArray shard_data; for (const auto &path : sharded_paths[shard]) { - ByteArray input; - ReadFromLocalFile(path, input); + std::string input; + RemoteFileGetContents(path, input); if (input.empty() || existing_hashes.contains(Hash(input))) { ++inputs_ignored; continue; } - CHECK_OK(appender->Write(input)); + CHECK_OK(appender->Write(ByteArray{input.begin(), input.end()})); ++inputs_added; } LOG(INFO) << VV(shard) << VV(inputs_added) << VV(inputs_ignored) diff --git a/centipede/centipede.h b/centipede/centipede.h index dfd02f69..63bbcd36 100644 --- a/centipede/centipede.h +++ b/centipede/centipede.h @@ -59,14 +59,12 @@ class Centipede { void FuzzingLoop(); // Saves the sharded corpus into `dir`, one file per input. - static void SaveCorpusToLocalDir(const Environment &env, - std::string_view dir); + static void CorpusToFiles(const Environment &env, std::string_view dir); // Exports the corpus from `dir` (one file per input) into the sharded corpus. // Reads `dir` recursively. // Ignores inputs that already exist in the shard they need to be added to. // Sharding is stable and depends only on env.total_shards and the file name. - static void ExportCorpusFromLocalDir(const Environment &env, - std::string_view dir); + static void CorpusFromFiles(const Environment &env, std::string_view dir); private: // Executes inputs from `input_vec`. 
diff --git a/centipede/centipede_interface.cc b/centipede/centipede_interface.cc index f187f572..236c2ff5 100644 --- a/centipede/centipede_interface.cc +++ b/centipede/centipede_interface.cc @@ -187,8 +187,8 @@ int CentipedeMain(const Environment &env, CentipedeCallbacksFactory &callbacks_factory) { SetSignalHandlers(env.stop_at); - if (!env.save_corpus_to_local_dir.empty()) { - Centipede::SaveCorpusToLocalDir(env, env.save_corpus_to_local_dir); + if (!env.corpus_to_files.empty()) { + Centipede::CorpusToFiles(env, env.corpus_to_files); return EXIT_SUCCESS; } @@ -201,15 +201,15 @@ int CentipedeMain(const Environment &env, } // Just export the corpus from a local dir and exit. - if (!env.export_corpus_from_local_dir.empty()) { - Centipede::ExportCorpusFromLocalDir(env, env.export_corpus_from_local_dir); + if (!env.corpus_from_files.empty()) { + Centipede::CorpusFromFiles(env, env.corpus_from_files); return EXIT_SUCCESS; } // Export the corpus from a local dir and then fuzz. if (!env.corpus_dir.empty()) { for (const auto &corpus_dir : env.corpus_dir) { - Centipede::ExportCorpusFromLocalDir(env, corpus_dir); + Centipede::CorpusFromFiles(env, corpus_dir); } } diff --git a/centipede/environment.h b/centipede/environment.h index 08569397..9d4d16d4 100644 --- a/centipede/environment.h +++ b/centipede/environment.h @@ -80,8 +80,8 @@ struct Environment { bool distill = false; size_t log_features_shards = 0; std::string knobs_file; - std::string save_corpus_to_local_dir; - std::string export_corpus_from_local_dir; + std::string corpus_to_files; + std::string corpus_from_files; std::vector corpus_dir; std::string symbolizer_path = "llvm-symbolizer"; std::string objdump_path = "objdump"; diff --git a/centipede/environment_flags.cc b/centipede/environment_flags.cc index 0a2d879d..212b2ea1 100644 --- a/centipede/environment_flags.cc +++ b/centipede/environment_flags.cc @@ -263,12 +263,10 @@ ABSL_FLAG(bool, print_runner_log, default_env->print_runner_log, ABSL_FLAG(std::string, 
knobs_file, default_env->knobs_file, "If not empty, knobs will be read from this (possibly remote) file." " The feature is experimental, not yet fully functional."); -ABSL_FLAG(std::string, save_corpus_to_local_dir, - default_env->save_corpus_to_local_dir, +ABSL_FLAG(std::string, corpus_to_files, default_env->corpus_to_files, "Save the remote corpus from working to the given directory, one " "file per corpus."); -ABSL_FLAG(std::string, export_corpus_from_local_dir, - default_env->export_corpus_from_local_dir, +ABSL_FLAG(std::string, corpus_from_files, default_env->corpus_from_files, "Export a corpus from a local directory with one file per input into " "the sharded remote corpus in workdir. Not recursive."); ABSL_FLAG(std::vector, corpus_dir, default_env->corpus_dir, @@ -470,9 +468,8 @@ Environment CreateEnvironmentFromFlags(const std::vector &argv) { .distill = absl::GetFlag(FLAGS_distill), .log_features_shards = absl::GetFlag(FLAGS_log_features_shards), .knobs_file = absl::GetFlag(FLAGS_knobs_file), - .save_corpus_to_local_dir = absl::GetFlag(FLAGS_save_corpus_to_local_dir), - .export_corpus_from_local_dir = - absl::GetFlag(FLAGS_export_corpus_from_local_dir), + .corpus_to_files = absl::GetFlag(FLAGS_corpus_to_files), + .corpus_from_files = absl::GetFlag(FLAGS_corpus_from_files), .corpus_dir = absl::GetFlag(FLAGS_corpus_dir), .symbolizer_path = absl::GetFlag(FLAGS_symbolizer_path), .objdump_path = absl::GetFlag(FLAGS_objdump_path), diff --git a/centipede/test_fuzzing_util.sh b/centipede/test_fuzzing_util.sh index d797992f..069ec22e 100644 --- a/centipede/test_fuzzing_util.sh +++ b/centipede/test_fuzzing_util.sh @@ -73,7 +73,7 @@ function centipede::test_crashing_target() { # Create a corpus with one crasher and one other input. echo -n "${crash_input}" >"${TMPCORPUS}/${crash_input}" # induces abort in the target. echo -n "${nice_input}" >"${TMPCORPUS}/${nice_input}" # just some input. 
- ${target} --workdir="${WD}" --export_corpus_from_local_dir="${TMPCORPUS}" + ${target} --workdir="${WD}" --corpus_from_files="${TMPCORPUS}" # Run fuzzing with num_runs=0, i.e. only run the inputs from the corpus. # Expecting a crash to be observed and reported. diff --git a/centipede/testing/centipede_main_test.sh b/centipede/testing/centipede_main_test.sh index 42013c98..bd70d95c 100755 --- a/centipede/testing/centipede_main_test.sh +++ b/centipede/testing/centipede_main_test.sh @@ -75,7 +75,7 @@ test_debug_symbols() { centipede::assert_regex_in_file "EDGE: LLVMFuzzerTestOneInput .*testing/test_fuzz_target.cc" "${LOG}" echo "============ ${FUNC}: add func1/func2-A inputs to the corpus." - test_fuzz --workdir="${WD}" --export_corpus_from_local_dir="${TMPCORPUS}" + test_fuzz --workdir="${WD}" --corpus_from_files="${TMPCORPUS}" echo "============ ${FUNC}: run again, append to the same LOG file." # TODO(b/282845630): Passing `--num_runs=1` only to trigger telemetry dumping. @@ -129,7 +129,7 @@ test_dictionary() { echo "foo" >"${TMPCORPUS}"/foo echo "bat" >"${TMPCORPUS}"/binary centipede::ensure_empty_dir "${WD}" - test_fuzz --workdir="${WD}" --export_corpus_from_local_dir "${TMPCORPUS}" + test_fuzz --workdir="${WD}" --corpus_from_files "${TMPCORPUS}" cp "${WD}/corpus.000000" "${DICT}" echo "============ ${FUNC}: testing binary dictionary file" @@ -150,7 +150,7 @@ test_for_each_blob() { echo "FoO" >"${TMPCORPUS}"/a echo "bAr" >"${TMPCORPUS}"/b - test_fuzz --workdir="${WD}" --export_corpus_from_local_dir "${TMPCORPUS}" + test_fuzz --workdir="${WD}" --corpus_from_files "${TMPCORPUS}" echo "============ ${FUNC}: test for_each_blob" test_fuzz --for_each_blob="cat %P" "${WD}"/corpus.000000 | tee "${LOG}" centipede::assert_regex_in_file "Running 'cat %P' on ${WD}/corpus.000000" "${LOG}" diff --git a/centipede/testing/clusterfuzz_format_test.sh b/centipede/testing/clusterfuzz_format_test.sh index 00d970cf..a5ae35c0 100755 --- a/centipede/testing/clusterfuzz_format_test.sh 
+++ b/centipede/testing/clusterfuzz_format_test.sh @@ -66,7 +66,7 @@ test_crashing_target() { # Create a corpus with one crasher and one other input. cp "$1" "${TMPCORPUS}" # Triggers an error. echo -n "foo" >"${TMPCORPUS}/foo" # Just some input. - abort_test_fuzz --export_corpus_from_local_dir="${TMPCORPUS}" + abort_test_fuzz --corpus_from_files="${TMPCORPUS}" # Run fuzzing with num_runs=0, i.e. only run the inputs from the corpus. # Expecting a crash to be observed and reported. diff --git a/centipede/util.cc b/centipede/util.cc index 46d0405d..82461519 100644 --- a/centipede/util.cc +++ b/centipede/util.cc @@ -56,6 +56,7 @@ #include "./centipede/defs.h" #include "./centipede/feature.h" #include "./centipede/logging.h" +#include "./centipede/remote_file.h" namespace centipede { @@ -142,6 +143,13 @@ void WriteToLocalHashedFileInDir(std::string_view dir_path, WriteToLocalFile(file_path, data); } +void WriteToRemoteHashedFileInDir(std::string_view dir_path, + absl::Span data) { + if (dir_path.empty()) return; + std::string file_path = std::filesystem::path(dir_path).append(Hash(data)); + RemoteFileSetContents(file_path, std::string(data.begin(), data.end())); +} + std::string HashOfFileContents(std::string_view file_path) { ByteArray ba; ReadFromLocalFile(file_path, ba); diff --git a/centipede/util.h b/centipede/util.h index c69a1d6c..69b05865 100644 --- a/centipede/util.h +++ b/centipede/util.h @@ -60,6 +60,9 @@ void WriteToLocalFile(std::string_view file_path, const FeatureVec &data); // Writes `data` to `dir_path`/Hash(`data`). Does nothing if `dir_path.empty()`. void WriteToLocalHashedFileInDir(std::string_view dir_path, absl::Span data); +// Same as `WriteToLocalHashedFileInDir` except supports remote files. +void WriteToRemoteHashedFileInDir(std::string_view dir_path, + absl::Span data); // Returns a path string suitable to create a temporary local directory. 
// Will return the same value every time it is called within one thread, // but different values for different threads and different processes.