From d8f9370f9b88b7ef0e683084c4ff37fb30a3a9ae Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 5 Mar 2024 22:18:22 +0100 Subject: [PATCH] [DOC] hibf tutorial --- src/CMakeLists.txt | 2 + src/tutorial_hibf/CMakeLists.txt | 8 +++ src/tutorial_hibf/README.md | 27 +++++++ src/tutorial_hibf/build.cpp | 116 ++++++++++++++++++++++++++++++ src/tutorial_hibf/count.cpp | 99 +++++++++++++++++++++++++ src/tutorial_hibf/index.hpp | 62 ++++++++++++++++ src/tutorial_hibf/main.cpp | 49 +++++++++++++ src/tutorial_hibf/search.cpp | 101 ++++++++++++++++++++++++++ src/tutorial_hibf/sequence_io.hpp | 18 +++++ src/tutorial_hibf/validator.hpp | 26 +++++++ 10 files changed, 508 insertions(+) create mode 100644 src/tutorial_hibf/CMakeLists.txt create mode 100644 src/tutorial_hibf/README.md create mode 100644 src/tutorial_hibf/build.cpp create mode 100644 src/tutorial_hibf/count.cpp create mode 100644 src/tutorial_hibf/index.hpp create mode 100644 src/tutorial_hibf/main.cpp create mode 100644 src/tutorial_hibf/search.cpp create mode 100644 src/tutorial_hibf/sequence_io.hpp create mode 100644 src/tutorial_hibf/validator.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b9fe825..19eee5a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,3 +19,5 @@ target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC "${PROJECT_NAME}_interface") add_executable ("${PROJECT_NAME}" main.cpp) target_link_libraries ("${PROJECT_NAME}" PRIVATE "${PROJECT_NAME}_lib") + +add_subdirectory (tutorial_hibf) diff --git a/src/tutorial_hibf/CMakeLists.txt b/src/tutorial_hibf/CMakeLists.txt new file mode 100644 index 0000000..3c3511e --- /dev/null +++ b/src/tutorial_hibf/CMakeLists.txt @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +# SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +# SPDX-License-Identifier: CC0-1.0 + +cmake_minimum_required (VERSION 3.16) + +add_executable ("hibf_tutorial" main.cpp build.cpp count.cpp search.cpp) +target_link_libraries ("hibf_tutorial" PRIVATE "${PROJECT_NAME}_interface") diff --git a/src/tutorial_hibf/README.md b/src/tutorial_hibf/README.md new file mode 100644 index 0000000..d782405 --- /dev/null +++ b/src/tutorial_hibf/README.md @@ -0,0 +1,27 @@ + + +# Files for testing + +https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz + +``` +curl https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz -o tutorial_files.tar.gz +tar xf tutorial_files.tar.gz +``` + +It also contains a script that creates a file that can be used as input for build.
+Note: **BEFORE** executing a script you downloaded from somewhere, you should read the file and check what it does. + +``` +./tutorial_files/get_filenames.sh +``` + +Input for build: `tutorial_files/filenames.txt`
+Query for search/count: `tutorial_files/reads.fastq` + +`reads.fastq` contains 1024 queries. Each has a length of `250`.
+The reads were simulated with `2` errors from their respective input file. diff --git a/src/tutorial_hibf/build.cpp b/src/tutorial_hibf/build.cpp new file mode 100644 index 0000000..38c03b4 --- /dev/null +++ b/src/tutorial_hibf/build.cpp @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#include +#include + +#include + +#include + +#include "index.hpp" +#include "sequence_io.hpp" +#include "validator.hpp" + +//!\brief A struct that stores the configuration of the build command. +struct build_config +{ + //!\brief The input file that contains the paths to the files that should be indexed. + std::filesystem::path input{}; + //!\brief The output file where the index should be stored. + std::filesystem::path output{}; + //!\brief The paths to the files that should be indexed. + std::vector input_files{}; + //!\brief The kmer size that should be used for the index. + uint8_t kmer{}; + //!\brief The number of threads that should be used for the index construction. + uint8_t threads{1u}; +}; + +//!\brief Builds an index from the input files and stores it in the output file. +void build_hibf(build_config & config) +{ + // This lambda function will be used for `.input_fn` of the config. + auto input_lambda = [&config](size_t const user_bin_index, seqan::hibf::insert_iterator it) + { + // TODO: Remove the following two lines: + (void)user_bin_index; // Suppresses unused variable warning. + (void)it; // Suppresses unused variable warning. + + // TODO: Read the sequence from the `user_bin_index`-th input file and assign all k-mers to `it`. + }; + + // TODO: Remove the following line: + (void)input_lambda; // Suppresses unused variable warning. + + // TODO: Create a seqan::hibf::config. + + // TODO: Create a seqan::hibf::hierarchical_interleaved_bloom_filter. + + // TODO: Create a myindex. + + // TODO: Store the myindex in the output file. +} + +//!\brief Reads the input files from the input file and stores them in the config. +void read_input_files(build_config & config) +{ + std::ifstream file{config.input}; + std::string line{}; + + while (std::getline(file, line)) + config.input_files.emplace_back(line); + + if (config.input_files.empty()) + throw sharg::validation_error{"No input files found in " + config.input.string() + "."}; + + std::ranges::for_each(config.input_files, sharg::input_file_validator{{"fa", "fasta"}}); +} + +//!\brief Adds the options to the parser and calls the build function. +void build(sharg::parser & parser) +{ + build_config config{}; + parser.add_option(config.input, + sharg::config{.short_id = 'i', + .long_id = "input", + .description = "Path to a file containing the paths to the files that should be " + "indexed. The file must contain one path per line.", + .required = true, + .validator = sharg::input_file_validator{}}); + + parser.add_option(config.output, + sharg::config{.short_id = 'o', + .long_id = "output", + .description = "Output path where the index should be stored.", + .required = true, + .validator = sharg::output_file_validator{}}); + + parser.add_option(config.kmer, + sharg::config{.short_id = 'k', + .long_id = "kmer", + .description = "The kmer size that should be used for the index.", + .required = true, + .validator = sharg::arithmetic_range_validator{1, 32}}); + + parser.add_option(config.threads, + sharg::config{.short_id = 't', + .long_id = "threads", + .description = "The number of threads that should be used for the index " + "construction.", + .validator = positive_integer_validator{}}); + + try + { + parser.parse(); + read_input_files(config); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "Parsing error. " << ext.what() << '\n'; + std::exit(EXIT_FAILURE); + } + + build_hibf(config); +} diff --git a/src/tutorial_hibf/count.cpp b/src/tutorial_hibf/count.cpp new file mode 100644 index 0000000..76c07ab --- /dev/null +++ b/src/tutorial_hibf/count.cpp @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#include +#include + +#include + +#include + +#include "index.hpp" +#include "sequence_io.hpp" +#include "validator.hpp" + +//!\brief A struct that stores the configuration of the count command. +struct count_config +{ + //!\brief The index that should be used for the counting. + std::filesystem::path index{}; + //!\brief The input file that contains the sequences that should be counted. + std::filesystem::path query{}; + //!\brief The output file where the counts should be stored. + std::filesystem::path output{}; + //!\brief If a count is below this threshold, it is not reported. + uint16_t threshold{1u}; +}; + +//!\brief Counts the k-mers in the query sequences using the given index and stores the counts in the output file. +void count_index(count_config const & config) +{ + // TODO: Remove the following line: + (void)config; // Suppresses unused variable warning. + + // TODO: Load the index from the given file. + + // TODO: + // * Create a counting_agent for the hibf. + // * Open the input file for reading IDs and sequences of the queries. + // * Open the output file for writing the results. + // * Create a std::vector that we use for storing the k-mer hashes. + + // TODO OPTIONAL: The counting_agent will a count for each user bin. + // It would be helpful to have some kind of header that tells us which user bin ID corresponds to which file. + // E.g.: + // #0 /path/to/user_bin_0.fasta + // #1 /path/to/user_bin_1.fasta + // etc. + + // TODO: + // * For each query sequence, calculate the k-mer hashes and determine the k-mer count in each user bin. + // * Write the results to the output file: "\t\n" + // HINT: + // * You can copy the elements of a view into a vector: `vector.assign(view.begin(), view.end())`. +} + +//!\brief Parses the command line arguments and counts the k-mers in the query sequences using the given index. +void count(sharg::parser & parser) +{ + count_config config{}; + parser.add_option(config.index, + sharg::config{.short_id = 'i', + .long_id = "index", + .description = "The index to count the k-mers in.", + .required = true, + .validator = sharg::input_file_validator{}}); + + parser.add_option(config.query, + sharg::config{.short_id = 'q', + .long_id = "query", + .description = "The input file that contains the sequences that should be counted.", + .required = true, + .validator = sharg::input_file_validator{{"fq", "fastq"}}}); + + parser.add_option(config.output, + sharg::config{.short_id = 'o', + .long_id = "output", + .description = "The output file where the counts should be stored.", + .required = true, + .validator = sharg::output_file_validator{}}); + + parser.add_option(config.threshold, + sharg::config{.short_id = '\0', + .long_id = "threshold", + .description = "Only report counts greather than or equal to this value.", + .validator = positive_integer_validator{}}); + + try + { + parser.parse(); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "Parsing error. " << ext.what() << '\n'; + std::exit(EXIT_FAILURE); + } + + count_index(config); +} diff --git a/src/tutorial_hibf/index.hpp b/src/tutorial_hibf/index.hpp new file mode 100644 index 0000000..3401d31 --- /dev/null +++ b/src/tutorial_hibf/index.hpp @@ -0,0 +1,62 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#pragma once + +#include +#include +#include + +//!\brief A class that stores the k-mer size, HIBF, the input file paths. +class myindex +{ +public: + //!\brief The k-mer size that was used for the index. + uint8_t kmer{}; + //!\brief The input files that were used for the index. + std::vector input_files{}; + //!\brief The Hierarchical Interleaved Bloom Filter that was constructed from the input files. + seqan::hibf::hierarchical_interleaved_bloom_filter hibf{}; + + myindex() = default; + myindex & operator=(myindex const &) = default; + myindex(myindex const &) = default; + myindex(myindex &&) = default; + myindex & operator=(myindex &&) = default; + ~myindex() = default; + + //!\brief Construct a myindex from the given k-mer size, input files, and HIBF. + explicit myindex(uint8_t const kmer, + std::vector input_files, + seqan::hibf::hierarchical_interleaved_bloom_filter hibf) : + kmer{kmer}, + input_files{std::move(input_files)}, + hibf{std::move(hibf)} + {} + + //!\brief Store myindex in the given file. + void store(std::filesystem::path const & path) const + { + std::ofstream fout{path}; + cereal::BinaryOutputArchive oarchive{fout}; + oarchive(*this); + } + + //!\brief Load myindex from the given file. + void load(std::filesystem::path const & path) + { + std::ifstream fin{path}; + cereal::BinaryInputArchive iarchive{fin}; + iarchive(*this); + } + + //!\brief Tells cereal how to serialize myindex. + template + void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) + { + archive(kmer); + archive(input_files); + archive(hibf); + } +}; diff --git a/src/tutorial_hibf/main.cpp b/src/tutorial_hibf/main.cpp new file mode 100644 index 0000000..b9db754 --- /dev/null +++ b/src/tutorial_hibf/main.cpp @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#include + +// These are forward declarations of the functions that will be called by the main function. +// They are defined in the files build.cpp, search.cpp, and count.cpp. +void build(sharg::parser &); +void search(sharg::parser &); +void count(sharg::parser &); + +int main(int argc, char ** argv) +{ + // The main executable will have three subcommands: build, search, and count. + sharg::parser parser{"hibf", {argv, argv + argc}, sharg::update_notifications::off, {"build", "search", "count"}}; + + try + { + parser.parse(); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "Parsing error. " << ext.what() << '\n'; + return -1; + } + + auto & subparser = parser.get_sub_parser(); + + if (subparser.info.app_name == "hibf-build") + { + build(subparser); + } + else if (subparser.info.app_name == "hibf-search") + { + search(subparser); + } + else if (subparser.info.app_name == "hibf-count") + { + count(subparser); + } + else + { + std::cerr << "Unknown subcommand: " << subparser.info.app_name << '\n'; + return -1; + } + + return 0; +} diff --git a/src/tutorial_hibf/search.cpp b/src/tutorial_hibf/search.cpp new file mode 100644 index 0000000..d30844c --- /dev/null +++ b/src/tutorial_hibf/search.cpp @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#include +#include + +#include + +#include + +#include "index.hpp" +#include "sequence_io.hpp" + +//!\brief A struct that stores the configuration of the search command. +struct search_config +{ + //!\brief The index that should be used for the search. + std::filesystem::path index{}; + //!\brief The input file that contains the sequences that should be searched. + std::filesystem::path query{}; + //!\brief The output file where the search results should be stored. + std::filesystem::path output{}; + //!\brief When a query sequence has more than this fraction of k-mers in a user bin, the user bin is reported. + double threshold{0.7}; +}; + +//!\brief Searches the k-mers in the query sequences using the given index and stores the results in the output file. +void search_index(search_config const & config) +{ + // TODO: Remove the following line: + (void)config; // Suppresses unused variable warning. + + // TODO: Load the index from the given file. + + // TODO: + // * Create a membership_agent for the hibf. + // * Open the input file for reading IDs and sequences of the queries. + // * Open the output file for writing the results. + // * Create a std::vector that we use for storing the k-mer hashes. + + // TODO OPTIONAL: The membership_agent will return the user bin IDs. + // It would be helpful to have some kind of header that tells us which user bin ID corresponds to which file. + // E.g.: + // #0 /path/to/user_bin_0.fasta + // #1 /path/to/user_bin_1.fasta + // etc. + + // TODO: + // * For each query sequence, calculate the k-mer hashes and determine in which user bins they occur. + // * Write the results to the output file: "\t\n" + // HINT: + // * You can copy the elements of a view into a vector: `vector.assign(view.begin(), view.end())`. + // OPTIONAL: + // * Sort the user bin IDs before writing them to the output file. +} + +//!\brief Parses the command line arguments and searches the k-mers in the query sequences using the given index. +void search(sharg::parser & parser) +{ + search_config config{}; + parser.add_option(config.index, + sharg::config{.short_id = 'i', + .long_id = "index", + .description = "The index to search the k-mers in.", + .required = true, + .validator = sharg::input_file_validator{}}); + + parser.add_option(config.query, + sharg::config{.short_id = 'q', + .long_id = "query", + .description = "The input file containing the sequences that should be searched.", + .required = true, + .validator = sharg::input_file_validator{{"fq", "fastq"}}}); + + parser.add_option(config.output, + sharg::config{.short_id = 'o', + .long_id = "output", + .description = "The output file where the search results should be stored.", + .required = true, + .validator = sharg::output_file_validator{}}); + + parser.add_option(config.threshold, + sharg::config{.short_id = '\0', + .long_id = "threshold", + .description = "Only report user bins where more than this fraction of all k-mers " + "in a query sequence are found.", + .validator = sharg::arithmetic_range_validator{0.0, 1.0}}); + + try + { + parser.parse(); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "Parsing error. " << ext.what() << '\n'; + std::exit(EXIT_FAILURE); + } + + search_index(config); +} diff --git a/src/tutorial_hibf/sequence_io.hpp b/src/tutorial_hibf/sequence_io.hpp new file mode 100644 index 0000000..26d8b8d --- /dev/null +++ b/src/tutorial_hibf/sequence_io.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#pragma once + +#include + +//!\brief Tells seqan3::sequence_file_input to use dna4 as the sequence alphabet. +struct dna4_traits : seqan3::sequence_file_input_default_traits_dna +{ + using sequence_alphabet = seqan3::dna4; +}; + +//!\brief This seqan3::sequence_file_input will output only the sequence. +using seq_reader = seqan3::sequence_file_input>; +//!\brief This seqan3::sequence_file_input will output id and sequence. +using id_seq_reader = seqan3::sequence_file_input>; diff --git a/src/tutorial_hibf/validator.hpp b/src/tutorial_hibf/validator.hpp new file mode 100644 index 0000000..ac4e8df --- /dev/null +++ b/src/tutorial_hibf/validator.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: CC0-1.0 + +#pragma once + +#include + +//!\brief A validator that only accepts integers greater than 0. +template + requires std::integral +struct positive_integer_validator +{ + using option_value_type = option_value_t; + + void operator()(option_value_type const & value) const + { + if (value < 1) + throw sharg::validation_error{"The value must greater than 0."}; + } + + std::string get_help_page_message() const + { + return "Value must be greater than 0."; + } +};