Skip to content

Commit

Permalink
[DOC] hibf tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Mar 5, 2024
1 parent 2d6439a commit 06c2f07
Show file tree
Hide file tree
Showing 10 changed files with 508 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC "${PROJECT_NAME}_interface")

add_executable ("${PROJECT_NAME}" main.cpp)
target_link_libraries ("${PROJECT_NAME}" PRIVATE "${PROJECT_NAME}_lib")

add_subdirectory (tutorial_hibf)
8 changes: 8 additions & 0 deletions src/tutorial_hibf/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
# SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
# SPDX-License-Identifier: CC0-1.0

cmake_minimum_required (VERSION 3.16)

add_executable ("hibf_tutorial" main.cpp build.cpp count.cpp search.cpp)
target_link_libraries ("hibf_tutorial" PRIVATE "${PROJECT_NAME}_interface")
27 changes: 27 additions & 0 deletions src/tutorial_hibf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<!--
SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
SPDX-License-Identifier: CC0-1.0
-->

# Files for testing

https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz

```
curl https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz -o tutorial_files.tar.gz
tar xf tutorial_files.tar.gz
```

It also contains a script that creates a file that can be used as input for build.<br>
Note: **BEFORE** executing a script you downloaded from somewhere, you should read the file and check what it does.

```
./tutorial_files/get_filenames.sh
```

Input for build: `tutorial_files/filenames.txt`<br>
Query for search/count: `tutorial_files/reads.fastq`

`reads.fastq` contains 1024 queries. Each has a length of `250`.<br>
The reads were simulated with `2` errors from their respective input file.
116 changes: 116 additions & 0 deletions src/tutorial_hibf/build.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0

#include <cstdint>
#include <filesystem>

#include <sharg/all.hpp>

#include <seqan3/search/views/kmer_hash.hpp>

#include "index.hpp"
#include "sequence_io.hpp"
#include "validator.hpp"

//!\brief A struct that stores the configuration of the build command.
struct build_config
{
//!\brief The input file that contains the paths to the files that should be indexed.
std::filesystem::path input{};
//!\brief The output file where the index should be stored.
std::filesystem::path output{};
//!\brief The paths to the files that should be indexed.
std::vector<std::filesystem::path> input_files{};
//!\brief The kmer size that should be used for the index.
uint8_t kmer{};
//!\brief The number of threads that should be used for the index construction.
uint8_t threads{1u};
};

//!\brief Builds an index from the input files and stores it in the output file.
void build_hibf(build_config & config)
{
// This lambda function will be used for `.input_fn` of the config.
auto input_lambda = [&config](size_t const user_bin_index, seqan::hibf::insert_iterator it)
{
// TODO: Remove the following two lines:
(void)user_bin_index; // Suppresses unused variable warning.
(void)it; // Suppresses unused variable warning.

// TODO: Read the sequence from the `user_bin_index`-th input file and assign all k-mers to `it`.
};

// TODO: Remove the following line:
(void)input_lambda; // Suppresses unused variable warning.

// TODO: Create a seqan::hibf::config.

// TODO: Create a seqan::hibf::hierarchical_interleaved_bloom_filter.

// TODO: Create a myindex.

// TODO: Store the myindex in the output file.
}

//!\brief Reads the input files from the input file and stores them in the config.
void read_input_files(build_config & config)
{
std::ifstream file{config.input};
std::string line{};

while (std::getline(file, line))
config.input_files.emplace_back(line);

if (config.input_files.empty())
throw sharg::validation_error{"No input files found in " + config.input.string() + "."};

std::ranges::for_each(config.input_files, sharg::input_file_validator{{"fa", "fasta"}});
}

//!\brief Adds the options to the parser and calls the build function.
void build(sharg::parser & parser)
{
build_config config{};
parser.add_option(config.input,
sharg::config{.short_id = 'i',
.long_id = "input",
.description = "Path to a file containing the paths to the files that should be "
"indexed. The file must contain one path per line.",
.required = true,
.validator = sharg::input_file_validator{}});

parser.add_option(config.output,
sharg::config{.short_id = 'o',
.long_id = "output",
.description = "Output path where the index should be stored.",
.required = true,
.validator = sharg::output_file_validator{}});

parser.add_option(config.kmer,
sharg::config{.short_id = 'k',
.long_id = "kmer",
.description = "The kmer size that should be used for the index.",
.required = true,
.validator = sharg::arithmetic_range_validator{1, 32}});

parser.add_option(config.threads,
sharg::config{.short_id = 't',
.long_id = "threads",
.description = "The number of threads that should be used for the index "
"construction.",
.validator = positive_integer_validator<decltype(config.threads)>{}});

try
{
parser.parse();
read_input_files(config);
}
catch (sharg::parser_error const & ext)
{
std::cerr << "Parsing error. " << ext.what() << '\n';
std::exit(EXIT_FAILURE);
}

build_hibf(config);
}
99 changes: 99 additions & 0 deletions src/tutorial_hibf/count.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0

#include <cstdint>
#include <filesystem>

#include <sharg/all.hpp>

#include <seqan3/search/views/kmer_hash.hpp>

#include "index.hpp"
#include "sequence_io.hpp"
#include "validator.hpp"

//!\brief A struct that stores the configuration of the count command.
struct count_config
{
//!\brief The index that should be used for the counting.
std::filesystem::path index{};
//!\brief The input file that contains the sequences that should be counted.
std::filesystem::path query{};
//!\brief The output file where the counts should be stored.
std::filesystem::path output{};
//!\brief If a count is below this threshold, it is not reported.
uint16_t threshold{1u};
};

//!\brief Counts the k-mers in the query sequences using the given index and stores the counts in the output file.
void count_index(count_config const & config)
{
// TODO: Remove the following line:
(void)config; // Suppresses unused variable warning.

// TODO: Load the index from the given file.

// TODO:
// * Create a counting_agent for the hibf.
// * Open the input file for reading IDs and sequences of the queries.
// * Open the output file for writing the results.
// * Create a std::vector that we use for storing the k-mer hashes.

// TODO OPTIONAL: The counting_agent will a count for each user bin.
// It would be helpful to have some kind of header that tells us which user bin ID corresponds to which file.
// E.g.:
// #0 /path/to/user_bin_0.fasta
// #1 /path/to/user_bin_1.fasta
// etc.

// TODO:
// * For each query sequence, calculate the k-mer hashes and determine the k-mer count in each user bin.
// * Write the results to the output file: "<ID>\t<count for each user bin separated by space>\n"
// HINT:
// * You can copy the elements of a view into a vector: `vector.assign(view.begin(), view.end())`.
}

//!\brief Parses the command line arguments and counts the k-mers in the query sequences using the given index.
void count(sharg::parser & parser)
{
count_config config{};
parser.add_option(config.index,
sharg::config{.short_id = 'i',
.long_id = "index",
.description = "The index to count the k-mers in.",
.required = true,
.validator = sharg::input_file_validator{}});

parser.add_option(config.query,
sharg::config{.short_id = 'q',
.long_id = "query",
.description = "The input file that contains the sequences that should be counted.",
.required = true,
.validator = sharg::input_file_validator{{"fq", "fastq"}}});

parser.add_option(config.output,
sharg::config{.short_id = 'o',
.long_id = "output",
.description = "The output file where the counts should be stored.",
.required = true,
.validator = sharg::output_file_validator{}});

parser.add_option(config.threshold,
sharg::config{.short_id = '\0',
.long_id = "threshold",
.description = "Only report counts greather than or equal to this value.",
.validator = positive_integer_validator<decltype(config.threshold)>{}});

try
{
parser.parse();
}
catch (sharg::parser_error const & ext)
{
std::cerr << "Parsing error. " << ext.what() << '\n';
std::exit(EXIT_FAILURE);
}

count_index(config);
}
62 changes: 62 additions & 0 deletions src/tutorial_hibf/index.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0

#pragma once

#include <hibf/cereal/path.hpp>
#include <hibf/config.hpp>
#include <hibf/hierarchical_interleaved_bloom_filter.hpp>

//!\brief A class that stores the k-mer size, HIBF, the input file paths.
class myindex
{
public:
//!\brief The k-mer size that was used for the index.
uint8_t kmer{};
//!\brief The input files that were used for the index.
std::vector<std::filesystem::path> input_files{};
//!\brief The Hierarchical Interleaved Bloom Filter that was constructed from the input files.
seqan::hibf::hierarchical_interleaved_bloom_filter hibf{};

myindex() = default;
myindex & operator=(myindex const &) = default;
myindex(myindex const &) = default;
myindex(myindex &&) = default;
myindex & operator=(myindex &&) = default;
~myindex() = default;

//!\brief Construct a myindex from the given k-mer size, input files, and HIBF.
explicit myindex(uint8_t const kmer,
std::vector<std::filesystem::path> input_files,
seqan::hibf::hierarchical_interleaved_bloom_filter hibf) :
kmer{kmer},
input_files{std::move(input_files)},
hibf{std::move(hibf)}
{}

//!\brief Store myindex in the given file.
void store(std::filesystem::path const & path) const
{
std::ofstream fout{path};
cereal::BinaryOutputArchive oarchive{fout};
oarchive(*this);
}

//!\brief Load myindex from the given file.
void load(std::filesystem::path const & path)
{
std::ifstream fin{path};
cereal::BinaryInputArchive iarchive{fin};
iarchive(*this);
}

//!\brief Tells cereal how to serialize myindex.
template <typename archive_t>
void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
{
archive(kmer);
archive(input_files);
archive(hibf);
}
};
49 changes: 49 additions & 0 deletions src/tutorial_hibf/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0

#include <sharg/all.hpp>

// These are forward declarations of the functions that will be called by the main function.
// They are defined in the files build.cpp, search.cpp, and count.cpp.
void build(sharg::parser &);
void search(sharg::parser &);
void count(sharg::parser &);

int main(int argc, char ** argv)
{
// The main executable will have three subcommands: build, search, and count.
sharg::parser parser{"hibf", {argv, argv + argc}, sharg::update_notifications::off, {"build", "search", "count"}};

try
{
parser.parse();
}
catch (sharg::parser_error const & ext)
{
std::cerr << "Parsing error. " << ext.what() << '\n';
return -1;
}

auto & subparser = parser.get_sub_parser();

if (subparser.info.app_name == "hibf-build")
{
build(subparser);
}
else if (subparser.info.app_name == "hibf-search")
{
search(subparser);
}
else if (subparser.info.app_name == "hibf-count")
{
count(subparser);
}
else
{
std::cerr << "Unknown subcommand: " << subparser.info.app_name << '\n';
return -1;
}

return 0;
}
Loading

0 comments on commit 06c2f07

Please sign in to comment.