diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5006df2..19eee5a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,4 +20,4 @@ target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC "${PROJECT_NAME}_interface") add_executable ("${PROJECT_NAME}" main.cpp) target_link_libraries ("${PROJECT_NAME}" PRIVATE "${PROJECT_NAME}_lib") -add_subdirectory (tutorial) +add_subdirectory (tutorial_hibf) diff --git a/src/tutorial/CMakeLists.txt b/src/tutorial/CMakeLists.txt deleted file mode 100644 index 27bf787..0000000 --- a/src/tutorial/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -cmake_minimum_required (VERSION 3.16) - -add_library ("hibf_tutorial_lib" STATIC build.cpp count.cpp search.cpp) -target_link_libraries ("hibf_tutorial_lib" PUBLIC "${PROJECT_NAME}_interface") - -add_executable ("hibf_tutorial" main.cpp) -target_link_libraries ("hibf_tutorial" PRIVATE "hibf_tutorial_lib") diff --git a/src/tutorial/count.cpp b/src/tutorial/count.cpp deleted file mode 100644 index 67a3ad7..0000000 --- a/src/tutorial/count.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 - -#include -#include - -#include - -#include - -#include "index.hpp" -#include "sequence_io.hpp" -#include "validator.hpp" - -struct count_config -{ - std::filesystem::path index{}; - std::filesystem::path query{}; - std::filesystem::path output{}; - uint16_t threshold{1u}; -}; - -void count_index(count_config const & config) -{ - myindex index{}; - index.load(config.index); - - auto agent = index.hibf.counting_agent(); - - id_seq_reader fin{config.query}; - std::vector hashes{}; - std::ofstream fout{config.output}; - - for (size_t i = 0; i < index.input_files.size(); ++i) - fout << '#' << i << '\t' << index.input_files[i].c_str() << '\n'; - fout << "#QUERY_ID\tCOUNTS\n"; - - for (auto && [id, seq] : fin) - { - auto kmer_hash_view = seq | seqan3::views::kmer_hash(seqan3::ungapped{index.kmer}); - hashes.assign(kmer_hash_view.begin(), kmer_hash_view.end()); - - auto & result = agent.bulk_count(hashes, config.threshold); - - fout << id << '\t'; - for (auto && r : result) - fout << r << ' '; - fout << '\n'; - } -} - -void count(sharg::parser & parser) -{ - count_config config{}; - parser.add_option(config.index, - sharg::config{.short_id = 'i', - .long_id = "index", - .description = "Index", - .required = true, - .validator = sharg::input_file_validator{}}); - - parser.add_option(config.query, - sharg::config{.short_id = 'q', - .long_id = "query", - .description = "Query", - .required = true, - .validator = sharg::input_file_validator{{"fq", "fastq"}}}); - - parser.add_option(config.output, - sharg::config{.short_id = 'o', - .long_id = "output", - .description = "Output", - .required = true, - .validator = sharg::output_file_validator{}}); - - parser.add_option(config.threshold, - sharg::config{.short_id = '\0', - .long_id = "threshold", - .description = "Threshold.", - .validator = positive_integer_validator{}}); - - try - { - parser.parse(); - } - catch (sharg::parser_error const & ext) - { - std::cerr << "Parsing error. " << ext.what() << '\n'; - std::exit(EXIT_FAILURE); - } - - count_index(config); -} diff --git a/src/tutorial/search.cpp b/src/tutorial/search.cpp deleted file mode 100644 index fa59505..0000000 --- a/src/tutorial/search.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 - -#include -#include - -#include - -#include - -#include "index.hpp" -#include "sequence_io.hpp" - -struct search_config -{ - std::filesystem::path index{}; - std::filesystem::path query{}; - std::filesystem::path output{}; - double threshold{0.7}; -}; - -void search_index(search_config const & config) -{ - myindex index{}; - index.load(config.index); - - auto agent = index.hibf.membership_agent(); - - id_seq_reader fin{config.query}; - std::vector hashes{}; - std::ofstream fout{config.output}; - - for (size_t i = 0; i < index.input_files.size(); ++i) - fout << '#' << i << '\t' << index.input_files[i].c_str() << '\n'; - fout << "#QUERY_ID\tUSER_BINS\n"; - - for (auto && [id, seq] : fin) - { - auto kmer_hash_view = seq | seqan3::views::kmer_hash(seqan3::ungapped{index.kmer}); - hashes.assign(kmer_hash_view.begin(), kmer_hash_view.end()); - uint16_t const threshold = static_cast(config.threshold * hashes.size()); - - auto & result = agent.membership_for(hashes, threshold); - agent.sort_results(); // Optional - - fout << id << '\t'; - for (auto && r : result) - fout << r << ' '; - fout << '\n'; - } -} - -void search(sharg::parser & parser) -{ - search_config config{}; - parser.add_option(config.index, - sharg::config{.short_id = 'i', - .long_id = "index", - .description = "Index", - .required = true, - .validator = sharg::input_file_validator{}}); - - parser.add_option(config.query, - sharg::config{.short_id = 'q', - .long_id = "query", - .description = "Query", - .required = true, - .validator = sharg::input_file_validator{{"fq", "fastq"}}}); - - parser.add_option(config.output, - sharg::config{.short_id = 'o', - .long_id = "output", - .description = "Output", - .required = true, - .validator = sharg::output_file_validator{}}); - - parser.add_option(config.threshold, - sharg::config{.short_id = '\0', - .long_id = "threshold", - .description = "Threshold.", - .validator = sharg::arithmetic_range_validator{0.0, 1.0}}); - - try - { - parser.parse(); - } - catch (sharg::parser_error const & ext) - { - std::cerr << "Parsing error. " << ext.what() << '\n'; - std::exit(EXIT_FAILURE); - } - - search_index(config); -} diff --git a/src/tutorial_hibf/CMakeLists.txt b/src/tutorial_hibf/CMakeLists.txt new file mode 100644 index 0000000..9c9c9e4 --- /dev/null +++ b/src/tutorial_hibf/CMakeLists.txt @@ -0,0 +1,4 @@ +cmake_minimum_required (VERSION 3.16) + +add_executable ("hibf_tutorial" main.cpp build.cpp count.cpp search.cpp) +target_link_libraries ("hibf_tutorial" PRIVATE "${PROJECT_NAME}_interface") diff --git a/src/tutorial_hibf/README.md b/src/tutorial_hibf/README.md new file mode 100644 index 0000000..d33b590 --- /dev/null +++ b/src/tutorial_hibf/README.md @@ -0,0 +1,20 @@ +# Files for testing + +https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz + +``` +curl https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz -o tutorial_files.tar.gz +tar xf tutorial_files.tar.gz +``` + +It also contains a script that creates a file that can be used as input for build. +(Hint: BEFORE executing a script you downloaded from somewhere, you should read the file and check what it does.) +``` +./tutorial_files/get_filenames.sh +``` + +Input for build: `tutorial_files/filenames.txt` +Query for search/count: `tutorial_files/reads.fastq` + +`reads.fastq` contains 1024 queries. Each has a length of `250`. +The reads were simulated with `2` errors from their respective input file. diff --git a/src/tutorial/build.cpp b/src/tutorial_hibf/build.cpp similarity index 57% rename from src/tutorial/build.cpp rename to src/tutorial_hibf/build.cpp index 26547ff..38c03b4 100644 --- a/src/tutorial/build.cpp +++ b/src/tutorial_hibf/build.cpp @@ -13,36 +13,47 @@ #include "sequence_io.hpp" #include "validator.hpp" +//!\brief A struct that stores the configuration of the build command. struct build_config { + //!\brief The input file that contains the paths to the files that should be indexed. std::filesystem::path input{}; - std::vector input_files{}; + //!\brief The output file where the index should be stored. std::filesystem::path output{}; + //!\brief The paths to the files that should be indexed. + std::vector input_files{}; + //!\brief The kmer size that should be used for the index. uint8_t kmer{}; + //!\brief The number of threads that should be used for the index construction. uint8_t threads{1u}; }; +//!\brief Builds an index from the input files and stores it in the output file. void build_hibf(build_config & config) { + // This lambda function will be used for `.input_fn` of the config. auto input_lambda = [&config](size_t const user_bin_index, seqan::hibf::insert_iterator it) { - seq_reader fin{config.input_files[user_bin_index]}; - for (auto && [seq] : fin) - for (auto && hash : seq | seqan3::views::kmer_hash(seqan3::ungapped{config.kmer})) - it = hash; + // TODO: Remove the following two lines: + (void)user_bin_index; // Suppresses unused variable warning. + (void)it; // Suppresses unused variable warning. + + // TODO: Read the sequence from the `user_bin_index`-th input file and assign all k-mers to `it`. }; - seqan::hibf::config hibf_config{.input_fn = input_lambda, - .number_of_user_bins = config.input_files.size(), - .number_of_hash_functions = 2u, - .maximum_fpr = 0.05, - .threads = config.threads}; + // TODO: Remove the following line: + (void)input_lambda; // Suppresses unused variable warning. + + // TODO: Create a seqan::hibf::config. + + // TODO: Create a seqan::hibf::hierarchical_interleaved_bloom_filter. + + // TODO: Create a myindex. - seqan::hibf::hierarchical_interleaved_bloom_filter hibf{hibf_config}; - myindex index{config.kmer, std::move(config.input_files), std::move(hibf)}; - index.store(config.output); + // TODO: Store the myindex in the output file. } +//!\brief Reads the input files from the input file and stores them in the config. void read_input_files(build_config & config) { std::ifstream file{config.input}; @@ -57,34 +68,37 @@ void read_input_files(build_config & config) std::ranges::for_each(config.input_files, sharg::input_file_validator{{"fa", "fasta"}}); } +//!\brief Adds the options to the parser and calls the build function. void build(sharg::parser & parser) { build_config config{}; parser.add_option(config.input, sharg::config{.short_id = 'i', .long_id = "input", - .description = "Input", + .description = "Path to a file containing the paths to the files that should be " + "indexed. The file must contain one path per line.", .required = true, .validator = sharg::input_file_validator{}}); parser.add_option(config.output, sharg::config{.short_id = 'o', .long_id = "output", - .description = "Output", + .description = "Output path where the index should be stored.", .required = true, .validator = sharg::output_file_validator{}}); parser.add_option(config.kmer, sharg::config{.short_id = 'k', .long_id = "kmer", - .description = "Kmer", + .description = "The kmer size that should be used for the index.", .required = true, .validator = sharg::arithmetic_range_validator{1, 32}}); parser.add_option(config.threads, sharg::config{.short_id = 't', .long_id = "threads", - .description = "Threads.", + .description = "The number of threads that should be used for the index " + "construction.", .validator = positive_integer_validator{}}); try diff --git a/src/tutorial_hibf/count.cpp b/src/tutorial_hibf/count.cpp new file mode 100644 index 0000000..76c07ab --- /dev/null +++ b/src/tutorial_hibf/count.cpp @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#include +#include + +#include + +#include + +#include "index.hpp" +#include "sequence_io.hpp" +#include "validator.hpp" + +//!\brief A struct that stores the configuration of the count command. +struct count_config +{ + //!\brief The index that should be used for the counting. + std::filesystem::path index{}; + //!\brief The input file that contains the sequences that should be counted. + std::filesystem::path query{}; + //!\brief The output file where the counts should be stored. + std::filesystem::path output{}; + //!\brief If a count is below this threshold, it is not reported. + uint16_t threshold{1u}; +}; + +//!\brief Counts the k-mers in the query sequences using the given index and stores the counts in the output file. +void count_index(count_config const & config) +{ + // TODO: Remove the following line: + (void)config; // Suppresses unused variable warning. + + // TODO: Load the index from the given file. + + // TODO: + // * Create a counting_agent for the hibf. + // * Open the input file for reading IDs and sequences of the queries. + // * Open the output file for writing the results. + // * Create a std::vector that we use for storing the k-mer hashes. + + // TODO OPTIONAL: The counting_agent will a count for each user bin. + // It would be helpful to have some kind of header that tells us which user bin ID corresponds to which file. + // E.g.: + // #0 /path/to/user_bin_0.fasta + // #1 /path/to/user_bin_1.fasta + // etc. + + // TODO: + // * For each query sequence, calculate the k-mer hashes and determine the k-mer count in each user bin. + // * Write the results to the output file: "\t\n" + // HINT: + // * You can copy the elements of a view into a vector: `vector.assign(view.begin(), view.end())`. +} + +//!\brief Parses the command line arguments and counts the k-mers in the query sequences using the given index. +void count(sharg::parser & parser) +{ + count_config config{}; + parser.add_option(config.index, + sharg::config{.short_id = 'i', + .long_id = "index", + .description = "The index to count the k-mers in.", + .required = true, + .validator = sharg::input_file_validator{}}); + + parser.add_option(config.query, + sharg::config{.short_id = 'q', + .long_id = "query", + .description = "The input file that contains the sequences that should be counted.", + .required = true, + .validator = sharg::input_file_validator{{"fq", "fastq"}}}); + + parser.add_option(config.output, + sharg::config{.short_id = 'o', + .long_id = "output", + .description = "The output file where the counts should be stored.", + .required = true, + .validator = sharg::output_file_validator{}}); + + parser.add_option(config.threshold, + sharg::config{.short_id = '\0', + .long_id = "threshold", + .description = "Only report counts greather than or equal to this value.", + .validator = positive_integer_validator{}}); + + try + { + parser.parse(); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "Parsing error. " << ext.what() << '\n'; + std::exit(EXIT_FAILURE); + } + + count_index(config); +} diff --git a/src/tutorial/index.hpp b/src/tutorial_hibf/index.hpp similarity index 74% rename from src/tutorial/index.hpp rename to src/tutorial_hibf/index.hpp index 323cf59..3401d31 100644 --- a/src/tutorial/index.hpp +++ b/src/tutorial_hibf/index.hpp @@ -8,11 +8,15 @@ #include #include +//!\brief A class that stores the k-mer size, HIBF, the input file paths. class myindex { public: + //!\brief The k-mer size that was used for the index. uint8_t kmer{}; + //!\brief The input files that were used for the index. std::vector input_files{}; + //!\brief The Hierarchical Interleaved Bloom Filter that was constructed from the input files. seqan::hibf::hierarchical_interleaved_bloom_filter hibf{}; myindex() = default; @@ -22,6 +26,7 @@ class myindex myindex & operator=(myindex &&) = default; ~myindex() = default; + //!\brief Construct a myindex from the given k-mer size, input files, and HIBF. explicit myindex(uint8_t const kmer, std::vector input_files, seqan::hibf::hierarchical_interleaved_bloom_filter hibf) : @@ -30,6 +35,7 @@ class myindex hibf{std::move(hibf)} {} + //!\brief Store myindex in the given file. void store(std::filesystem::path const & path) const { std::ofstream fout{path}; @@ -37,6 +43,7 @@ class myindex oarchive(*this); } + //!\brief Load myindex from the given file. void load(std::filesystem::path const & path) { std::ifstream fin{path}; @@ -44,6 +51,7 @@ class myindex iarchive(*this); } + //!\brief Tells cereal how to serialize myindex. template void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) { diff --git a/src/tutorial/main.cpp b/src/tutorial_hibf/main.cpp similarity index 81% rename from src/tutorial/main.cpp rename to src/tutorial_hibf/main.cpp index 7a44007..b9db754 100644 --- a/src/tutorial/main.cpp +++ b/src/tutorial_hibf/main.cpp @@ -2,16 +2,17 @@ // SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik // SPDX-License-Identifier: CC0-1.0 -// https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz - #include +// These are forward declarations of the functions that will be called by the main function. +// They are defined in the files build.cpp, search.cpp, and count.cpp. void build(sharg::parser &); void search(sharg::parser &); void count(sharg::parser &); int main(int argc, char ** argv) { + // The main executable will have three subcommands: build, search, and count. sharg::parser parser{"hibf", {argv, argv + argc}, sharg::update_notifications::off, {"build", "search", "count"}}; try diff --git a/src/tutorial_hibf/search.cpp b/src/tutorial_hibf/search.cpp new file mode 100644 index 0000000..d30844c --- /dev/null +++ b/src/tutorial_hibf/search.cpp @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#include +#include + +#include + +#include + +#include "index.hpp" +#include "sequence_io.hpp" + +//!\brief A struct that stores the configuration of the search command. +struct search_config +{ + //!\brief The index that should be used for the search. + std::filesystem::path index{}; + //!\brief The input file that contains the sequences that should be searched. + std::filesystem::path query{}; + //!\brief The output file where the search results should be stored. + std::filesystem::path output{}; + //!\brief When a query sequence has more than this fraction of k-mers in a user bin, the user bin is reported. + double threshold{0.7}; +}; + +//!\brief Searches the k-mers in the query sequences using the given index and stores the results in the output file. +void search_index(search_config const & config) +{ + // TODO: Remove the following line: + (void)config; // Suppresses unused variable warning. + + // TODO: Load the index from the given file. + + // TODO: + // * Create a membership_agent for the hibf. + // * Open the input file for reading IDs and sequences of the queries. + // * Open the output file for writing the results. + // * Create a std::vector that we use for storing the k-mer hashes. + + // TODO OPTIONAL: The membership_agent will return the user bin IDs. + // It would be helpful to have some kind of header that tells us which user bin ID corresponds to which file. + // E.g.: + // #0 /path/to/user_bin_0.fasta + // #1 /path/to/user_bin_1.fasta + // etc. + + // TODO: + // * For each query sequence, calculate the k-mer hashes and determine in which user bins they occur. + // * Write the results to the output file: "\t\n" + // HINT: + // * You can copy the elements of a view into a vector: `vector.assign(view.begin(), view.end())`. + // OPTIONAL: + // * Sort the user bin IDs before writing them to the output file. +} + +//!\brief Parses the command line arguments and searches the k-mers in the query sequences using the given index. +void search(sharg::parser & parser) +{ + search_config config{}; + parser.add_option(config.index, + sharg::config{.short_id = 'i', + .long_id = "index", + .description = "The index to search the k-mers in.", + .required = true, + .validator = sharg::input_file_validator{}}); + + parser.add_option(config.query, + sharg::config{.short_id = 'q', + .long_id = "query", + .description = "The input file containing the sequences that should be searched.", + .required = true, + .validator = sharg::input_file_validator{{"fq", "fastq"}}}); + + parser.add_option(config.output, + sharg::config{.short_id = 'o', + .long_id = "output", + .description = "The output file where the search results should be stored.", + .required = true, + .validator = sharg::output_file_validator{}}); + + parser.add_option(config.threshold, + sharg::config{.short_id = '\0', + .long_id = "threshold", + .description = "Only report user bins where more than this fraction of all k-mers " + "in a query sequence are found.", + .validator = sharg::arithmetic_range_validator{0.0, 1.0}}); + + try + { + parser.parse(); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "Parsing error. " << ext.what() << '\n'; + std::exit(EXIT_FAILURE); + } + + search_index(config); +} diff --git a/src/tutorial/sequence_io.hpp b/src/tutorial_hibf/sequence_io.hpp similarity index 72% rename from src/tutorial/sequence_io.hpp rename to src/tutorial_hibf/sequence_io.hpp index 06729a5..26d8b8d 100644 --- a/src/tutorial/sequence_io.hpp +++ b/src/tutorial_hibf/sequence_io.hpp @@ -6,10 +6,13 @@ #include +//!\brief Tells seqan3::sequence_file_input to use dna4 as the sequence alphabet. struct dna4_traits : seqan3::sequence_file_input_default_traits_dna { using sequence_alphabet = seqan3::dna4; }; +//!\brief This seqan3::sequence_file_input will output only the sequence. using seq_reader = seqan3::sequence_file_input>; +//!\brief This seqan3::sequence_file_input will output id and sequence. using id_seq_reader = seqan3::sequence_file_input>; diff --git a/src/tutorial/validator.hpp b/src/tutorial_hibf/validator.hpp similarity index 85% rename from src/tutorial/validator.hpp rename to src/tutorial_hibf/validator.hpp index ccd486e..ac4e8df 100644 --- a/src/tutorial/validator.hpp +++ b/src/tutorial_hibf/validator.hpp @@ -6,8 +6,9 @@ #include +//!\brief A validator that only accepts integers greater than 0. template - requires std::is_arithmetic_v + requires std::integral struct positive_integer_validator { using option_value_type = option_value_t;