Skip to content

Commit

Permalink
[DOC] hibf tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Mar 5, 2024
1 parent 6273694 commit 140e1ee
Show file tree
Hide file tree
Showing 13 changed files with 272 additions and 217 deletions.
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC "${PROJECT_NAME}_interface")
add_executable ("${PROJECT_NAME}" main.cpp)
target_link_libraries ("${PROJECT_NAME}" PRIVATE "${PROJECT_NAME}_lib")

add_subdirectory (tutorial)
add_subdirectory (tutorial_hibf)
7 changes: 0 additions & 7 deletions src/tutorial/CMakeLists.txt

This file was deleted.

94 changes: 0 additions & 94 deletions src/tutorial/count.cpp

This file was deleted.

95 changes: 0 additions & 95 deletions src/tutorial/search.cpp

This file was deleted.

4 changes: 4 additions & 0 deletions src/tutorial_hibf/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Minimum CMake version required to process this directory.
cmake_minimum_required (VERSION 3.16)

# The tutorial executable and the source files it is compiled from.
add_executable ("hibf_tutorial" main.cpp build.cpp count.cpp search.cpp)
# Link the tutorial privately against the "${PROJECT_NAME}_interface" target.
target_link_libraries ("hibf_tutorial" PRIVATE "${PROJECT_NAME}_interface")
20 changes: 20 additions & 0 deletions src/tutorial_hibf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Files for testing

https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz

```
curl https://ftp.seqan.de/tutorial/hibf/tutorial_files.tar.gz -o tutorial_files.tar.gz
tar xf tutorial_files.tar.gz
```

The archive also contains a script that creates a file that can be used as input for `build`.
(Hint: BEFORE executing a script you downloaded from somewhere, you should read the file and check what it does.)
```
./tutorial_files/get_filenames.sh
```

- Input for build: `tutorial_files/filenames.txt`
- Query for search/count: `tutorial_files/reads.fastq`

`reads.fastq` contains 1024 queries. Each has a length of `250`.
The reads were simulated with `2` errors from their respective input file.
48 changes: 31 additions & 17 deletions src/tutorial/build.cpp → src/tutorial_hibf/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,47 @@
#include "sequence_io.hpp"
#include "validator.hpp"

/*!\brief A struct that stores the configuration of the build command.
 * \details Every member is value-initialised; `threads` defaults to 1.
 */
struct build_config
{
    //!\brief The input file that contains the paths to the files that should be indexed.
    std::filesystem::path input{};
    //!\brief The output file where the index should be stored.
    std::filesystem::path output{};
    //!\brief The paths to the files that should be indexed.
    std::vector<std::filesystem::path> input_files{};
    //!\brief The kmer size that should be used for the index.
    uint8_t kmer{};
    //!\brief The number of threads that should be used for the index construction.
    uint8_t threads{1u};
};

/*!\brief Builds an index from the input files and stores it in the output file.
 * \param[in,out] config The build configuration. `config.input_files` is moved into the index, so it
 *                       must not be relied upon after this call.
 */
void build_hibf(build_config & config)
{
    // This lambda function is used for `.input_fn` of the HIBF config: given a user bin index and an
    // insert iterator, it assigns every k-mer hash of the corresponding input file to the iterator.
    auto input_lambda = [&config](size_t const user_bin_index, seqan::hibf::insert_iterator it)
    {
        seq_reader fin{config.input_files[user_bin_index]};
        for (auto && [seq] : fin)
            for (auto && hash : seq | seqan3::views::kmer_hash(seqan3::ungapped{config.kmer}))
                it = hash;
    };

    seqan::hibf::config hibf_config{.input_fn = input_lambda,
                                    .number_of_user_bins = config.input_files.size(),
                                    .number_of_hash_functions = 2u,
                                    .maximum_fpr = 0.05,
                                    .threads = config.threads};

    // Keep this before the move below: `input_lambda` references `config.input_files`
    // (presumably it is invoked while the HIBF is being constructed).
    seqan::hibf::hierarchical_interleaved_bloom_filter hibf{hibf_config};

    // The index takes ownership of the input file list and the HIBF.
    myindex index{config.kmer, std::move(config.input_files), std::move(hibf)};
    index.store(config.output);
}

//!\brief Reads the input files from the input file and stores them in the config.
void read_input_files(build_config & config)
{
std::ifstream file{config.input};
Expand All @@ -57,34 +68,37 @@ void read_input_files(build_config & config)
std::ranges::for_each(config.input_files, sharg::input_file_validator{{"fa", "fasta"}});
}

//!\brief Adds the options to the parser and calls the build function.
void build(sharg::parser & parser)
{
build_config config{};
parser.add_option(config.input,
sharg::config{.short_id = 'i',
.long_id = "input",
.description = "Input",
.description = "Path to a file containing the paths to the files that should be "
"indexed. The file must contain one path per line.",
.required = true,
.validator = sharg::input_file_validator{}});

parser.add_option(config.output,
sharg::config{.short_id = 'o',
.long_id = "output",
.description = "Output",
.description = "Output path where the index should be stored.",
.required = true,
.validator = sharg::output_file_validator{}});

parser.add_option(config.kmer,
sharg::config{.short_id = 'k',
.long_id = "kmer",
.description = "Kmer",
.description = "The kmer size that should be used for the index.",
.required = true,
.validator = sharg::arithmetic_range_validator{1, 32}});

parser.add_option(config.threads,
sharg::config{.short_id = 't',
.long_id = "threads",
.description = "Threads.",
.description = "The number of threads that should be used for the index "
"construction.",
.validator = positive_integer_validator<decltype(config.threads)>{}});

try
Expand Down
Loading

0 comments on commit 140e1ee

Please sign in to comment.