-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
508 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin | ||
# SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik | ||
# SPDX-License-Identifier: CC0-1.0 | ||
|
||
cmake_minimum_required (VERSION 3.16) | ||
|
||
add_executable ("hibf_tutorial" main.cpp build.cpp count.cpp search.cpp) | ||
target_link_libraries ("hibf_tutorial" PRIVATE "${PROJECT_NAME}_interface") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
<!-- | ||
SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin | ||
SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik | ||
SPDX-License-Identifier: CC0-1.0 | ||
--> | ||
|
||
# Files for testing | ||
|
||
https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz | ||
|
||
``` | ||
curl https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz -o tutorial_files.tar.gz | ||
tar xf tutorial_files.tar.gz | ||
``` | ||
|
||
It also contains a script that creates a file that can be used as input for build.<br> | ||
Note: **BEFORE** executing a script you downloaded from somewhere, you should read the file and check what it does. | ||
|
||
``` | ||
./tutorial_files/get_filenames.sh | ||
``` | ||
|
||
Input for build: `tutorial_files/filenames.txt`<br> | ||
Query for search/count: `tutorial_files/reads.fastq` | ||
|
||
`reads.fastq` contains 1024 queries. Each has a length of `250`.<br> | ||
The reads were simulated with `2` errors from their respective input file. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin | ||
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik | ||
// SPDX-License-Identifier: CC0-1.0 | ||
|
||
#include <cstdint> | ||
#include <filesystem> | ||
|
||
#include <sharg/all.hpp> | ||
|
||
#include <seqan3/search/views/kmer_hash.hpp> | ||
|
||
#include "index.hpp" | ||
#include "sequence_io.hpp" | ||
#include "validator.hpp" | ||
|
||
//!\brief A struct that stores the configuration of the build command. | ||
struct build_config | ||
{ | ||
//!\brief The input file that contains the paths to the files that should be indexed. | ||
std::filesystem::path input{}; | ||
//!\brief The output file where the index should be stored. | ||
std::filesystem::path output{}; | ||
//!\brief The paths to the files that should be indexed. | ||
std::vector<std::filesystem::path> input_files{}; | ||
//!\brief The kmer size that should be used for the index. | ||
uint8_t kmer{}; | ||
//!\brief The number of threads that should be used for the index construction. | ||
uint8_t threads{1u}; | ||
}; | ||
|
||
//!\brief Builds an index from the input files and stores it in the output file. | ||
void build_hibf(build_config & config) | ||
{ | ||
// This lambda function will be used for `.input_fn` of the config. | ||
auto input_lambda = [&config](size_t const user_bin_index, seqan::hibf::insert_iterator it) | ||
{ | ||
// TODO: Remove the following two lines: | ||
(void)user_bin_index; // Suppresses unused variable warning. | ||
(void)it; // Suppresses unused variable warning. | ||
|
||
// TODO: Read the sequence from the `user_bin_index`-th input file and assign all k-mers to `it`. | ||
}; | ||
|
||
// TODO: Remove the following line: | ||
(void)input_lambda; // Suppresses unused variable warning. | ||
|
||
// TODO: Create a seqan::hibf::config. | ||
|
||
// TODO: Create a seqan::hibf::hierarchical_interleaved_bloom_filter. | ||
|
||
// TODO: Create a myindex. | ||
|
||
// TODO: Store the myindex in the output file. | ||
} | ||
|
||
//!\brief Reads the input files from the input file and stores them in the config. | ||
void read_input_files(build_config & config) | ||
{ | ||
std::ifstream file{config.input}; | ||
std::string line{}; | ||
|
||
while (std::getline(file, line)) | ||
config.input_files.emplace_back(line); | ||
|
||
if (config.input_files.empty()) | ||
throw sharg::validation_error{"No input files found in " + config.input.string() + "."}; | ||
|
||
std::ranges::for_each(config.input_files, sharg::input_file_validator{{"fa", "fasta"}}); | ||
} | ||
|
||
//!\brief Adds the options to the parser and calls the build function. | ||
void build(sharg::parser & parser) | ||
{ | ||
build_config config{}; | ||
parser.add_option(config.input, | ||
sharg::config{.short_id = 'i', | ||
.long_id = "input", | ||
.description = "Path to a file containing the paths to the files that should be " | ||
"indexed. The file must contain one path per line.", | ||
.required = true, | ||
.validator = sharg::input_file_validator{}}); | ||
|
||
parser.add_option(config.output, | ||
sharg::config{.short_id = 'o', | ||
.long_id = "output", | ||
.description = "Output path where the index should be stored.", | ||
.required = true, | ||
.validator = sharg::output_file_validator{}}); | ||
|
||
parser.add_option(config.kmer, | ||
sharg::config{.short_id = 'k', | ||
.long_id = "kmer", | ||
.description = "The kmer size that should be used for the index.", | ||
.required = true, | ||
.validator = sharg::arithmetic_range_validator{1, 32}}); | ||
|
||
parser.add_option(config.threads, | ||
sharg::config{.short_id = 't', | ||
.long_id = "threads", | ||
.description = "The number of threads that should be used for the index " | ||
"construction.", | ||
.validator = positive_integer_validator<decltype(config.threads)>{}}); | ||
|
||
try | ||
{ | ||
parser.parse(); | ||
read_input_files(config); | ||
} | ||
catch (sharg::parser_error const & ext) | ||
{ | ||
std::cerr << "Parsing error. " << ext.what() << '\n'; | ||
std::exit(EXIT_FAILURE); | ||
} | ||
|
||
build_hibf(config); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin | ||
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik | ||
// SPDX-License-Identifier: CC0-1.0 | ||
|
||
#include <cstdint> | ||
#include <filesystem> | ||
|
||
#include <sharg/all.hpp> | ||
|
||
#include <seqan3/search/views/kmer_hash.hpp> | ||
|
||
#include "index.hpp" | ||
#include "sequence_io.hpp" | ||
#include "validator.hpp" | ||
|
||
//!\brief A struct that stores the configuration of the count command. | ||
struct count_config | ||
{ | ||
//!\brief The index that should be used for the counting. | ||
std::filesystem::path index{}; | ||
//!\brief The input file that contains the sequences that should be counted. | ||
std::filesystem::path query{}; | ||
//!\brief The output file where the counts should be stored. | ||
std::filesystem::path output{}; | ||
//!\brief If a count is below this threshold, it is not reported. | ||
uint16_t threshold{1u}; | ||
}; | ||
|
||
//!\brief Counts the k-mers in the query sequences using the given index and stores the counts in the output file. | ||
void count_index(count_config const & config) | ||
{ | ||
// TODO: Remove the following line: | ||
(void)config; // Suppresses unused variable warning. | ||
|
||
// TODO: Load the index from the given file. | ||
|
||
// TODO: | ||
// * Create a counting_agent for the hibf. | ||
// * Open the input file for reading IDs and sequences of the queries. | ||
// * Open the output file for writing the results. | ||
// * Create a std::vector that we use for storing the k-mer hashes. | ||
|
||
// TODO OPTIONAL: The counting_agent will a count for each user bin. | ||
// It would be helpful to have some kind of header that tells us which user bin ID corresponds to which file. | ||
// E.g.: | ||
// #0 /path/to/user_bin_0.fasta | ||
// #1 /path/to/user_bin_1.fasta | ||
// etc. | ||
|
||
// TODO: | ||
// * For each query sequence, calculate the k-mer hashes and determine the k-mer count in each user bin. | ||
// * Write the results to the output file: "<ID>\t<count for each user bin separated by space>\n" | ||
// HINT: | ||
// * You can copy the elements of a view into a vector: `vector.assign(view.begin(), view.end())`. | ||
} | ||
|
||
//!\brief Parses the command line arguments and counts the k-mers in the query sequences using the given index. | ||
void count(sharg::parser & parser) | ||
{ | ||
count_config config{}; | ||
parser.add_option(config.index, | ||
sharg::config{.short_id = 'i', | ||
.long_id = "index", | ||
.description = "The index to count the k-mers in.", | ||
.required = true, | ||
.validator = sharg::input_file_validator{}}); | ||
|
||
parser.add_option(config.query, | ||
sharg::config{.short_id = 'q', | ||
.long_id = "query", | ||
.description = "The input file that contains the sequences that should be counted.", | ||
.required = true, | ||
.validator = sharg::input_file_validator{{"fq", "fastq"}}}); | ||
|
||
parser.add_option(config.output, | ||
sharg::config{.short_id = 'o', | ||
.long_id = "output", | ||
.description = "The output file where the counts should be stored.", | ||
.required = true, | ||
.validator = sharg::output_file_validator{}}); | ||
|
||
parser.add_option(config.threshold, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "threshold", | ||
.description = "Only report counts greather than or equal to this value.", | ||
.validator = positive_integer_validator<decltype(config.threshold)>{}}); | ||
|
||
try | ||
{ | ||
parser.parse(); | ||
} | ||
catch (sharg::parser_error const & ext) | ||
{ | ||
std::cerr << "Parsing error. " << ext.what() << '\n'; | ||
std::exit(EXIT_FAILURE); | ||
} | ||
|
||
count_index(config); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin | ||
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik | ||
// SPDX-License-Identifier: CC0-1.0 | ||
|
||
#pragma once | ||
|
||
#include <hibf/cereal/path.hpp> | ||
#include <hibf/config.hpp> | ||
#include <hibf/hierarchical_interleaved_bloom_filter.hpp> | ||
|
||
//!\brief A class that stores the k-mer size, HIBF, the input file paths. | ||
class myindex | ||
{ | ||
public: | ||
//!\brief The k-mer size that was used for the index. | ||
uint8_t kmer{}; | ||
//!\brief The input files that were used for the index. | ||
std::vector<std::filesystem::path> input_files{}; | ||
//!\brief The Hierarchical Interleaved Bloom Filter that was constructed from the input files. | ||
seqan::hibf::hierarchical_interleaved_bloom_filter hibf{}; | ||
|
||
myindex() = default; | ||
myindex & operator=(myindex const &) = default; | ||
myindex(myindex const &) = default; | ||
myindex(myindex &&) = default; | ||
myindex & operator=(myindex &&) = default; | ||
~myindex() = default; | ||
|
||
//!\brief Construct a myindex from the given k-mer size, input files, and HIBF. | ||
explicit myindex(uint8_t const kmer, | ||
std::vector<std::filesystem::path> input_files, | ||
seqan::hibf::hierarchical_interleaved_bloom_filter hibf) : | ||
kmer{kmer}, | ||
input_files{std::move(input_files)}, | ||
hibf{std::move(hibf)} | ||
{} | ||
|
||
//!\brief Store myindex in the given file. | ||
void store(std::filesystem::path const & path) const | ||
{ | ||
std::ofstream fout{path}; | ||
cereal::BinaryOutputArchive oarchive{fout}; | ||
oarchive(*this); | ||
} | ||
|
||
//!\brief Load myindex from the given file. | ||
void load(std::filesystem::path const & path) | ||
{ | ||
std::ifstream fin{path}; | ||
cereal::BinaryInputArchive iarchive{fin}; | ||
iarchive(*this); | ||
} | ||
|
||
//!\brief Tells cereal how to serialize myindex. | ||
template <typename archive_t> | ||
void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) | ||
{ | ||
archive(kmer); | ||
archive(input_files); | ||
archive(hibf); | ||
} | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin | ||
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik | ||
// SPDX-License-Identifier: CC0-1.0 | ||
|
||
#include <sharg/all.hpp> | ||
|
||
// These are forward declarations of the functions that will be called by the main function. | ||
// They are defined in the files build.cpp, search.cpp, and count.cpp. | ||
void build(sharg::parser &); | ||
void search(sharg::parser &); | ||
void count(sharg::parser &); | ||
|
||
int main(int argc, char ** argv) | ||
{ | ||
// The main executable will have three subcommands: build, search, and count. | ||
sharg::parser parser{"hibf", {argv, argv + argc}, sharg::update_notifications::off, {"build", "search", "count"}}; | ||
|
||
try | ||
{ | ||
parser.parse(); | ||
} | ||
catch (sharg::parser_error const & ext) | ||
{ | ||
std::cerr << "Parsing error. " << ext.what() << '\n'; | ||
return -1; | ||
} | ||
|
||
auto & subparser = parser.get_sub_parser(); | ||
|
||
if (subparser.info.app_name == "hibf-build") | ||
{ | ||
build(subparser); | ||
} | ||
else if (subparser.info.app_name == "hibf-search") | ||
{ | ||
search(subparser); | ||
} | ||
else if (subparser.info.app_name == "hibf-count") | ||
{ | ||
count(subparser); | ||
} | ||
else | ||
{ | ||
std::cerr << "Unknown subcommand: " << subparser.info.app_name << '\n'; | ||
return -1; | ||
} | ||
|
||
return 0; | ||
} |
Oops, something went wrong.