diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b9fe825..19eee5a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -19,3 +19,5 @@ target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC "${PROJECT_NAME}_interface")
add_executable ("${PROJECT_NAME}" main.cpp)
target_link_libraries ("${PROJECT_NAME}" PRIVATE "${PROJECT_NAME}_lib")
+
+add_subdirectory (tutorial_hibf)
diff --git a/src/tutorial_hibf/CMakeLists.txt b/src/tutorial_hibf/CMakeLists.txt
new file mode 100644
index 0000000..3c3511e
--- /dev/null
+++ b/src/tutorial_hibf/CMakeLists.txt
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
+# SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
+# SPDX-License-Identifier: CC0-1.0
+
+cmake_minimum_required (VERSION 3.16)
+
+add_executable ("hibf_tutorial" main.cpp build.cpp count.cpp search.cpp)
+target_link_libraries ("hibf_tutorial" PRIVATE "${PROJECT_NAME}_interface")
diff --git a/src/tutorial_hibf/README.md b/src/tutorial_hibf/README.md
new file mode 100644
index 0000000..d782405
--- /dev/null
+++ b/src/tutorial_hibf/README.md
@@ -0,0 +1,27 @@
+
+
+# Files for testing
+
+https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz
+
+```
+curl https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz -o tutorial_files.tar.gz
+tar xf tutorial_files.tar.gz
+```
+
+It also contains a script that creates a file that can be used as input for build.
+Note: **BEFORE** executing a script you downloaded from somewhere, you should read the file and check what it does.
+
+```
+./tutorial_files/get_filenames.sh
+```
+
+Input for build: `tutorial_files/filenames.txt`
+Query for search/count: `tutorial_files/reads.fastq`
+
+`reads.fastq` contains 1024 queries. Each has a length of `250`.
+The reads were simulated with `2` errors from their respective input file.
diff --git a/src/tutorial_hibf/build.cpp b/src/tutorial_hibf/build.cpp
new file mode 100644
index 0000000..38c03b4
--- /dev/null
+++ b/src/tutorial_hibf/build.cpp
@@ -0,0 +1,116 @@
+// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: CC0-1.0
+
+#include
+#include
+
+#include
+
+#include
+
+#include "index.hpp"
+#include "sequence_io.hpp"
+#include "validator.hpp"
+
+//!\brief A struct that stores the configuration of the build command.
+struct build_config
+{
+ //!\brief The input file that contains the paths to the files that should be indexed.
+ std::filesystem::path input{};
+ //!\brief The output file where the index should be stored.
+ std::filesystem::path output{};
+ //!\brief The paths to the files that should be indexed.
+ std::vector input_files{};
+ //!\brief The kmer size that should be used for the index.
+ uint8_t kmer{};
+ //!\brief The number of threads that should be used for the index construction.
+ uint8_t threads{1u};
+};
+
+//!\brief Builds an index from the input files and stores it in the output file.
+void build_hibf(build_config & config)
+{
+ // This lambda function will be used for `.input_fn` of the config.
+ auto input_lambda = [&config](size_t const user_bin_index, seqan::hibf::insert_iterator it)
+ {
+ // TODO: Remove the following two lines:
+ (void)user_bin_index; // Suppresses unused variable warning.
+ (void)it; // Suppresses unused variable warning.
+
+ // TODO: Read the sequence from the `user_bin_index`-th input file and assign all k-mers to `it`.
+ };
+
+ // TODO: Remove the following line:
+ (void)input_lambda; // Suppresses unused variable warning.
+
+ // TODO: Create a seqan::hibf::config.
+
+ // TODO: Create a seqan::hibf::hierarchical_interleaved_bloom_filter.
+
+ // TODO: Create a myindex.
+
+ // TODO: Store the myindex in the output file.
+}
+
+//!\brief Reads the input files from the input file and stores them in the config.
+void read_input_files(build_config & config)
+{
+ std::ifstream file{config.input};
+ std::string line{};
+
+ while (std::getline(file, line))
+ config.input_files.emplace_back(line);
+
+ if (config.input_files.empty())
+ throw sharg::validation_error{"No input files found in " + config.input.string() + "."};
+
+ std::ranges::for_each(config.input_files, sharg::input_file_validator{{"fa", "fasta"}});
+}
+
+//!\brief Adds the options to the parser and calls the build function.
+void build(sharg::parser & parser)
+{
+ build_config config{};
+ parser.add_option(config.input,
+ sharg::config{.short_id = 'i',
+ .long_id = "input",
+ .description = "Path to a file containing the paths to the files that should be "
+ "indexed. The file must contain one path per line.",
+ .required = true,
+ .validator = sharg::input_file_validator{}});
+
+ parser.add_option(config.output,
+ sharg::config{.short_id = 'o',
+ .long_id = "output",
+ .description = "Output path where the index should be stored.",
+ .required = true,
+ .validator = sharg::output_file_validator{}});
+
+ parser.add_option(config.kmer,
+ sharg::config{.short_id = 'k',
+ .long_id = "kmer",
+ .description = "The kmer size that should be used for the index.",
+ .required = true,
+ .validator = sharg::arithmetic_range_validator{1, 32}});
+
+ parser.add_option(config.threads,
+ sharg::config{.short_id = 't',
+ .long_id = "threads",
+ .description = "The number of threads that should be used for the index "
+ "construction.",
+ .validator = positive_integer_validator{}});
+
+ try
+ {
+ parser.parse();
+ read_input_files(config);
+ }
+ catch (sharg::parser_error const & ext)
+ {
+ std::cerr << "Parsing error. " << ext.what() << '\n';
+ std::exit(EXIT_FAILURE);
+ }
+
+ build_hibf(config);
+}
diff --git a/src/tutorial_hibf/count.cpp b/src/tutorial_hibf/count.cpp
new file mode 100644
index 0000000..76c07ab
--- /dev/null
+++ b/src/tutorial_hibf/count.cpp
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: CC0-1.0
+
+#include
+#include
+
+#include
+
+#include
+
+#include "index.hpp"
+#include "sequence_io.hpp"
+#include "validator.hpp"
+
+//!\brief A struct that stores the configuration of the count command.
+struct count_config
+{
+ //!\brief The index that should be used for the counting.
+ std::filesystem::path index{};
+ //!\brief The input file that contains the sequences that should be counted.
+ std::filesystem::path query{};
+ //!\brief The output file where the counts should be stored.
+ std::filesystem::path output{};
+ //!\brief If a count is below this threshold, it is not reported.
+ uint16_t threshold{1u};
+};
+
+//!\brief Counts the k-mers in the query sequences using the given index and stores the counts in the output file.
+void count_index(count_config const & config)
+{
+ // TODO: Remove the following line:
+ (void)config; // Suppresses unused variable warning.
+
+ // TODO: Load the index from the given file.
+
+ // TODO:
+ // * Create a counting_agent for the hibf.
+ // * Open the input file for reading IDs and sequences of the queries.
+ // * Open the output file for writing the results.
+ // * Create a std::vector that we use for storing the k-mer hashes.
+
+ // TODO OPTIONAL: The counting_agent will a count for each user bin.
+ // It would be helpful to have some kind of header that tells us which user bin ID corresponds to which file.
+ // E.g.:
+ // #0 /path/to/user_bin_0.fasta
+ // #1 /path/to/user_bin_1.fasta
+ // etc.
+
+ // TODO:
+ // * For each query sequence, calculate the k-mer hashes and determine the k-mer count in each user bin.
+ // * Write the results to the output file: "\t\n"
+ // HINT:
+ // * You can copy the elements of a view into a vector: `vector.assign(view.begin(), view.end())`.
+}
+
+//!\brief Parses the command line arguments and counts the k-mers in the query sequences using the given index.
+void count(sharg::parser & parser)
+{
+ count_config config{};
+ parser.add_option(config.index,
+ sharg::config{.short_id = 'i',
+ .long_id = "index",
+ .description = "The index to count the k-mers in.",
+ .required = true,
+ .validator = sharg::input_file_validator{}});
+
+ parser.add_option(config.query,
+ sharg::config{.short_id = 'q',
+ .long_id = "query",
+ .description = "The input file that contains the sequences that should be counted.",
+ .required = true,
+ .validator = sharg::input_file_validator{{"fq", "fastq"}}});
+
+ parser.add_option(config.output,
+ sharg::config{.short_id = 'o',
+ .long_id = "output",
+ .description = "The output file where the counts should be stored.",
+ .required = true,
+ .validator = sharg::output_file_validator{}});
+
+ parser.add_option(config.threshold,
+ sharg::config{.short_id = '\0',
+ .long_id = "threshold",
+ .description = "Only report counts greather than or equal to this value.",
+ .validator = positive_integer_validator{}});
+
+ try
+ {
+ parser.parse();
+ }
+ catch (sharg::parser_error const & ext)
+ {
+ std::cerr << "Parsing error. " << ext.what() << '\n';
+ std::exit(EXIT_FAILURE);
+ }
+
+ count_index(config);
+}
diff --git a/src/tutorial_hibf/index.hpp b/src/tutorial_hibf/index.hpp
new file mode 100644
index 0000000..3401d31
--- /dev/null
+++ b/src/tutorial_hibf/index.hpp
@@ -0,0 +1,62 @@
+// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: CC0-1.0
+
+#pragma once
+
+#include
+#include
+#include
+
+//!\brief A class that stores the k-mer size, HIBF, the input file paths.
+class myindex
+{
+public:
+ //!\brief The k-mer size that was used for the index.
+ uint8_t kmer{};
+ //!\brief The input files that were used for the index.
+ std::vector input_files{};
+ //!\brief The Hierarchical Interleaved Bloom Filter that was constructed from the input files.
+ seqan::hibf::hierarchical_interleaved_bloom_filter hibf{};
+
+ myindex() = default;
+ myindex & operator=(myindex const &) = default;
+ myindex(myindex const &) = default;
+ myindex(myindex &&) = default;
+ myindex & operator=(myindex &&) = default;
+ ~myindex() = default;
+
+ //!\brief Construct a myindex from the given k-mer size, input files, and HIBF.
+ explicit myindex(uint8_t const kmer,
+ std::vector input_files,
+ seqan::hibf::hierarchical_interleaved_bloom_filter hibf) :
+ kmer{kmer},
+ input_files{std::move(input_files)},
+ hibf{std::move(hibf)}
+ {}
+
+ //!\brief Store myindex in the given file.
+ void store(std::filesystem::path const & path) const
+ {
+ std::ofstream fout{path};
+ cereal::BinaryOutputArchive oarchive{fout};
+ oarchive(*this);
+ }
+
+ //!\brief Load myindex from the given file.
+ void load(std::filesystem::path const & path)
+ {
+ std::ifstream fin{path};
+ cereal::BinaryInputArchive iarchive{fin};
+ iarchive(*this);
+ }
+
+ //!\brief Tells cereal how to serialize myindex.
+ template
+ void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
+ {
+ archive(kmer);
+ archive(input_files);
+ archive(hibf);
+ }
+};
diff --git a/src/tutorial_hibf/main.cpp b/src/tutorial_hibf/main.cpp
new file mode 100644
index 0000000..b9db754
--- /dev/null
+++ b/src/tutorial_hibf/main.cpp
@@ -0,0 +1,49 @@
+// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: CC0-1.0
+
+#include
+
+// These are forward declarations of the functions that will be called by the main function.
+// They are defined in the files build.cpp, search.cpp, and count.cpp.
+void build(sharg::parser &);
+void search(sharg::parser &);
+void count(sharg::parser &);
+
+int main(int argc, char ** argv)
+{
+ // The main executable will have three subcommands: build, search, and count.
+ sharg::parser parser{"hibf", {argv, argv + argc}, sharg::update_notifications::off, {"build", "search", "count"}};
+
+ try
+ {
+ parser.parse();
+ }
+ catch (sharg::parser_error const & ext)
+ {
+ std::cerr << "Parsing error. " << ext.what() << '\n';
+ return -1;
+ }
+
+ auto & subparser = parser.get_sub_parser();
+
+ if (subparser.info.app_name == "hibf-build")
+ {
+ build(subparser);
+ }
+ else if (subparser.info.app_name == "hibf-search")
+ {
+ search(subparser);
+ }
+ else if (subparser.info.app_name == "hibf-count")
+ {
+ count(subparser);
+ }
+ else
+ {
+ std::cerr << "Unknown subcommand: " << subparser.info.app_name << '\n';
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/src/tutorial_hibf/search.cpp b/src/tutorial_hibf/search.cpp
new file mode 100644
index 0000000..d30844c
--- /dev/null
+++ b/src/tutorial_hibf/search.cpp
@@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: CC0-1.0
+
+#include
+#include
+
+#include
+
+#include
+
+#include "index.hpp"
+#include "sequence_io.hpp"
+
+//!\brief A struct that stores the configuration of the search command.
+struct search_config
+{
+ //!\brief The index that should be used for the search.
+ std::filesystem::path index{};
+ //!\brief The input file that contains the sequences that should be searched.
+ std::filesystem::path query{};
+ //!\brief The output file where the search results should be stored.
+ std::filesystem::path output{};
+ //!\brief When a query sequence has more than this fraction of k-mers in a user bin, the user bin is reported.
+ double threshold{0.7};
+};
+
+//!\brief Searches the k-mers in the query sequences using the given index and stores the results in the output file.
+void search_index(search_config const & config)
+{
+ // TODO: Remove the following line:
+ (void)config; // Suppresses unused variable warning.
+
+ // TODO: Load the index from the given file.
+
+ // TODO:
+ // * Create a membership_agent for the hibf.
+ // * Open the input file for reading IDs and sequences of the queries.
+ // * Open the output file for writing the results.
+ // * Create a std::vector that we use for storing the k-mer hashes.
+
+ // TODO OPTIONAL: The membership_agent will return the user bin IDs.
+ // It would be helpful to have some kind of header that tells us which user bin ID corresponds to which file.
+ // E.g.:
+ // #0 /path/to/user_bin_0.fasta
+ // #1 /path/to/user_bin_1.fasta
+ // etc.
+
+ // TODO:
+ // * For each query sequence, calculate the k-mer hashes and determine in which user bins they occur.
+ // * Write the results to the output file: "\t\n"
+ // HINT:
+ // * You can copy the elements of a view into a vector: `vector.assign(view.begin(), view.end())`.
+ // OPTIONAL:
+ // * Sort the user bin IDs before writing them to the output file.
+}
+
+//!\brief Parses the command line arguments and searches the k-mers in the query sequences using the given index.
+void search(sharg::parser & parser)
+{
+ search_config config{};
+ parser.add_option(config.index,
+ sharg::config{.short_id = 'i',
+ .long_id = "index",
+ .description = "The index to search the k-mers in.",
+ .required = true,
+ .validator = sharg::input_file_validator{}});
+
+ parser.add_option(config.query,
+ sharg::config{.short_id = 'q',
+ .long_id = "query",
+ .description = "The input file containing the sequences that should be searched.",
+ .required = true,
+ .validator = sharg::input_file_validator{{"fq", "fastq"}}});
+
+ parser.add_option(config.output,
+ sharg::config{.short_id = 'o',
+ .long_id = "output",
+ .description = "The output file where the search results should be stored.",
+ .required = true,
+ .validator = sharg::output_file_validator{}});
+
+ parser.add_option(config.threshold,
+ sharg::config{.short_id = '\0',
+ .long_id = "threshold",
+ .description = "Only report user bins where more than this fraction of all k-mers "
+ "in a query sequence are found.",
+ .validator = sharg::arithmetic_range_validator{0.0, 1.0}});
+
+ try
+ {
+ parser.parse();
+ }
+ catch (sharg::parser_error const & ext)
+ {
+ std::cerr << "Parsing error. " << ext.what() << '\n';
+ std::exit(EXIT_FAILURE);
+ }
+
+ search_index(config);
+}
diff --git a/src/tutorial_hibf/sequence_io.hpp b/src/tutorial_hibf/sequence_io.hpp
new file mode 100644
index 0000000..26d8b8d
--- /dev/null
+++ b/src/tutorial_hibf/sequence_io.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: CC0-1.0
+
+#pragma once
+
+#include
+
+//!\brief Tells seqan3::sequence_file_input to use dna4 as the sequence alphabet.
+struct dna4_traits : seqan3::sequence_file_input_default_traits_dna
+{
+ using sequence_alphabet = seqan3::dna4;
+};
+
+//!\brief This seqan3::sequence_file_input will output only the sequence.
+using seq_reader = seqan3::sequence_file_input>;
+//!\brief This seqan3::sequence_file_input will output id and sequence.
+using id_seq_reader = seqan3::sequence_file_input>;
diff --git a/src/tutorial_hibf/validator.hpp b/src/tutorial_hibf/validator.hpp
new file mode 100644
index 0000000..ac4e8df
--- /dev/null
+++ b/src/tutorial_hibf/validator.hpp
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: CC0-1.0
+
+#pragma once
+
+#include
+
+//!\brief A validator that only accepts integers greater than 0.
+template
+ requires std::integral
+struct positive_integer_validator
+{
+ using option_value_type = option_value_t;
+
+ void operator()(option_value_type const & value) const
+ {
+ if (value < 1)
+ throw sharg::validation_error{"The value must greater than 0."};
+ }
+
+ std::string get_help_page_message() const
+ {
+ return "Value must be greater than 0.";
+ }
+};