From b107b601d8f48c779c0ba4866eb8a03929903251 Mon Sep 17 00:00:00 2001 From: juli-p <63345753+juli-p@users.noreply.github.com> Date: Fri, 22 Sep 2023 16:51:09 +0200 Subject: [PATCH] calculate_scores: implemented phenotype shuffling, silent SNP filtering --- README.md | 4 ++-- src/data_model/SNPStorage.cpp | 4 ++++ src/data_model/SNPStorage.hpp | 3 +++ src/data_model/SNPStorage.ipp | 5 +++++ src/data_model/SNP_t.cpp | 6 ++++++ src/jobs/InstanceLoader.cpp | 5 +++++ src/jobs/ReadLINDENSets.cpp | 21 ++++++++++++++++++--- src/jobs/ReadLINDENSets.hpp | 3 ++- src/jobs/ReadMACOEDSets.cpp | 23 +++++++++++++++++++---- src/jobs/ReadMACOEDSets.hpp | 3 ++- src/jobs/ReadNeEDLSets.cpp | 19 +++++++++++++------ src/jobs/ReadNeEDLSets.hpp | 3 ++- src/jobs/ShufflePhenotypes.cpp | 21 +++++++++++++++++++++ src/jobs/ShufflePhenotypes.hpp | 23 +++++++++++++++++++++++ src/pipelines/HelperPipelines.cpp | 7 +++++++ src/pipelines/HelperPipelines.hpp | 1 + test/model/src/NeEDL.cpp | 6 +++++- test/model/src/calculate_scores.cpp | 14 +++++++++++--- 18 files changed, 149 insertions(+), 22 deletions(-) create mode 100644 src/jobs/ShufflePhenotypes.cpp create mode 100644 src/jobs/ShufflePhenotypes.hpp diff --git a/README.md b/README.md index adebccf92..37da5ed0d 100644 --- a/README.md +++ b/README.md @@ -650,10 +650,10 @@ With calculate_scores the user can apply the statistical models mentioned in *Lo --gwas-input-format JSON_EPIGEN # The input file path. Either specify a path to a .json file or path to a directory containing only one .json file. ---input-path +--gwas-input-file # Phenotype information of the dataset. NeEDL can deal with DICHOTOMOUS, CATEGORICAL and QUANTITATIVE phenotypes. --num-categories is only necessary for CATEGORICAL phenotypes with more than two categories. ---phenotype CATEGORICAL +--gwas-input-phenotype CATEGORICAL --num-categories 2 ``` diff --git a/src/data_model/SNPStorage.cpp b/src/data_model/SNPStorage.cpp index 79ab7d490..5a8cc63fe 100644 --- a/src/data_model/SNPStorage.cpp +++ b/src/data_model/SNPStorage.cpp @@ -392,4 +392,8 @@ namespace epi { return {}; } + void SNPStorage_WithoutGeno::shuffle_phenotypes() { + throw epi::Error("Shuffling phenotype data is not available in BIM-only SNPStorage."); + } + } // epi \ No newline at end of file diff --git a/src/data_model/SNPStorage.hpp b/src/data_model/SNPStorage.hpp index 7c6d544b0..5aac53007 100644 --- a/src/data_model/SNPStorage.hpp +++ b/src/data_model/SNPStorage.hpp @@ -116,6 +116,7 @@ namespace epi { virtual size_t num_categories() const = 0; virtual std::vector> get_individuals_per_category(const SNPSet & set) const = 0; virtual std::vector get_num_individuals_per_category() const = 0; + virtual void shuffle_phenotypes() = 0; SNPStorage(); @@ -157,6 +158,7 @@ namespace epi { size_t num_categories() const override; std::vector> get_individuals_per_category(const SNPSet & set) const override; std::vector get_num_individuals_per_category() const override; + void shuffle_phenotypes() override; private: std::vector snp_names; @@ -187,6 +189,7 @@ namespace epi { size_t num_categories() const override; std::vector> get_individuals_per_category(const SNPSet & set) const override; std::vector get_num_individuals_per_category() const override; + void shuffle_phenotypes() override; private: std::shared_ptr> instance; diff --git a/src/data_model/SNPStorage.ipp b/src/data_model/SNPStorage.ipp index 8450c6479..99a8bb323 100644 --- a/src/data_model/SNPStorage.ipp +++ b/src/data_model/SNPStorage.ipp @@ -285,6 +285,11 @@ namespace epi { return {}; } + template + void SNPStorage_PhenoType::shuffle_phenotypes() { + instance->shuffle_phenotypes(); + } + } diff --git a/src/data_model/SNP_t.cpp b/src/data_model/SNP_t.cpp index 8113507cb..2e4eb01ab 100644 --- a/src/data_model/SNP_t.cpp +++ b/src/data_model/SNP_t.cpp @@ -89,9 +89,11 @@ namespace epi { } SNPSet::SNPSet(const std::vector &snps) { + /* if (snps.size() > MAXIMUM_SNP_SET_SIZE) { throw epi::Error("Tried to create a snp-set with size > " + std::to_string(MAXIMUM_SNP_SET_SIZE)); } + */ this->snps = std::make_shared>(snps.begin(), snps.end()); std::sort(this->snps->begin(), this->snps->end()); } @@ -183,9 +185,11 @@ namespace epi { if (std::find(snps->begin(), snps->end(), snp) == snps->end()) { snps->insert(snps->end(), snp); + /* if (snps->size() > MAXIMUM_SNP_SET_SIZE) { throw epi::Error("Tried to create a snp-set with size > " + std::to_string(MAXIMUM_SNP_SET_SIZE)); } + */ std::sort(snps->begin(), snps->end()); scores = nullptr; @@ -213,11 +217,13 @@ namespace epi { snps->erase( unique( snps->begin(), snps->end() ), snps->end() ); if (snps->size() != size_before) { + /* if (snps->size() > MAXIMUM_SNP_SET_SIZE) { throw epi::Error("Tried to create a snp-set with size > " + std::to_string(MAXIMUM_SNP_SET_SIZE)); } scores = nullptr; scores_calculated = 0; + */ } return *this; diff --git a/src/jobs/InstanceLoader.cpp b/src/jobs/InstanceLoader.cpp index e55e8d05b..6d75c741d 100644 --- a/src/jobs/InstanceLoader.cpp +++ b/src/jobs/InstanceLoader.cpp @@ -53,9 +53,13 @@ namespace epi { TimeLogger logger("loading GWAS data"); Logger::logLine("input file: " + path); + std::uniform_int_distribution seed_gen (0, std::numeric_limits::max()); + size_t seed = seed_gen(data->random_device[omp_get_thread_num()]); + if (phenotype == options::PhenoType::QUANTITATIVE) { auto instance = std::make_shared>(); instance->load(inputFormat, path); + instance->set_seed(seed); auto storage = std::make_shared>(instance); auto snpStorage = std::static_pointer_cast(storage); data->snpStorage = snpStorage; @@ -63,6 +67,7 @@ namespace epi { } else if (phenotype == options::PhenoType::CATEGORICAL) { auto instance = std::make_shared>(num_categories); instance->load(inputFormat, path); + instance->set_seed(seed); auto storage = std::make_shared>(instance); auto snpStorage = std::static_pointer_cast(storage); data->snpStorage = snpStorage; diff --git a/src/jobs/ReadLINDENSets.cpp b/src/jobs/ReadLINDENSets.cpp index e49e6583f..5f1e7d006 100644 --- a/src/jobs/ReadLINDENSets.cpp +++ b/src/jobs/ReadLINDENSets.cpp @@ -7,7 +7,7 @@ #include "../util/helper_functions.hpp" namespace epi { - ReadLINDENSets::ReadLINDENSets(const std::string& path) { + ReadLINDENSets::ReadLINDENSets(const std::string& path, bool ignore_unknown_snps) { FileFinder ff; ff.add_ends_with(".reciprocalPairs"); this->path = find_file_by_ending(std::move(path), ff); @@ -33,6 +33,9 @@ namespace epi { } file.close(); + + + this->ignore_unknown_snps = ignore_unknown_snps; } void ReadLINDENSets::run(std::shared_ptr data) { @@ -44,8 +47,20 @@ namespace epi { const std::string& snp1_string = parser.cell(i, column1_index); const std::string& snp2_string = parser.cell(i, column2_index); SNPSet set; - set += data->snpStorage->by_name(snp1_string); - set += data->snpStorage->by_name(snp2_string); + + try { + set += data->snpStorage->by_name(snp1_string); + } catch (epi::SNPNotFoundError &err) { + if (!ignore_unknown_snps) throw err; + } + + try { + set += data->snpStorage->by_name(snp2_string); + } catch (epi::SNPNotFoundError &err) { + if (!ignore_unknown_snps) throw err; + } + + if (set.size() == 0) continue; set.set_attribute("LINDEN_rank", std::to_string(i)); diff --git a/src/jobs/ReadLINDENSets.hpp b/src/jobs/ReadLINDENSets.hpp index f318ef864..a7404bc8f 100644 --- a/src/jobs/ReadLINDENSets.hpp +++ b/src/jobs/ReadLINDENSets.hpp @@ -11,12 +11,13 @@ namespace epi { class ReadLINDENSets : public Job { public: - ReadLINDENSets(const std::string& path); + ReadLINDENSets(const std::string& path, bool ignore_unknown_snps = false); void run(std::shared_ptr data) override; rapidjson::Value getConfig(rapidjson::Document &doc) override; private: std::string path; size_t column1_index, column2_index; + bool ignore_unknown_snps; }; } // epi diff --git a/src/jobs/ReadMACOEDSets.cpp b/src/jobs/ReadMACOEDSets.cpp index 33dd1f1a5..5613d3112 100644 --- a/src/jobs/ReadMACOEDSets.cpp +++ b/src/jobs/ReadMACOEDSets.cpp @@ -7,12 +7,15 @@ #include "../util/helper_functions.hpp" namespace epi { - ReadMACOEDSets::ReadMACOEDSets(const std::string& path) { + ReadMACOEDSets::ReadMACOEDSets(const std::string& path, bool ignore_unknown_snps) { FileFinder ff; ff.add_ends_with(".txt"); this->path = find_file_by_ending(std::move(path), ff); - // do not do a column check here because these files are really fucked up + // do not perform a column check here because these files are really fucked up + + + this->ignore_unknown_snps = ignore_unknown_snps; } void ReadMACOEDSets::run(std::shared_ptr data) { @@ -30,8 +33,20 @@ namespace epi { const std::string& snp1_string = splits[1]; const std::string& snp2_string = splits[2]; SNPSet set; - set += data->snpStorage->by_name(snp1_string); - set += data->snpStorage->by_name(snp2_string); + + try { + set += data->snpStorage->by_name(snp1_string); + } catch (epi::SNPNotFoundError &err) { + if (!ignore_unknown_snps) throw err; + } + + try { + set += data->snpStorage->by_name(snp2_string); + } catch (epi::SNPNotFoundError &err) { + if (!ignore_unknown_snps) throw err; + } + + if (set.size() == 0) continue; data->snpSetStorage.push_back(set); } diff --git a/src/jobs/ReadMACOEDSets.hpp b/src/jobs/ReadMACOEDSets.hpp index aa7f2c721..0a952852b 100644 --- a/src/jobs/ReadMACOEDSets.hpp +++ b/src/jobs/ReadMACOEDSets.hpp @@ -11,11 +11,12 @@ namespace epi { class ReadMACOEDSets : public Job { public: - ReadMACOEDSets(const std::string& path); + ReadMACOEDSets(const std::string& path, bool ignore_unknown_snps = false); void run(std::shared_ptr data) override; rapidjson::Value getConfig(rapidjson::Document &doc) override; private: std::string path; + bool ignore_unknown_snps; }; } // epi diff --git a/src/jobs/ReadNeEDLSets.cpp b/src/jobs/ReadNeEDLSets.cpp index 631585c5c..7a3a0285f 100644 --- a/src/jobs/ReadNeEDLSets.cpp +++ b/src/jobs/ReadNeEDLSets.cpp @@ -7,7 +7,7 @@ #include "../util/helper_functions.hpp" namespace epi { - ReadNeEDLSets::ReadNeEDLSets(const std::string& path) { + ReadNeEDLSets::ReadNeEDLSets(const std::string& path, bool ignore_unknown_snps) { FileFinder ff; ff.add_ends_with(".csv"); this->path = find_file_by_ending(std::move(path), ff); @@ -30,6 +30,8 @@ namespace epi { } file.close(); + + this->ignore_unknown_snps = ignore_unknown_snps; } void ReadNeEDLSets::run(std::shared_ptr data) { @@ -53,15 +55,20 @@ namespace epi { data->snpSetStorage.clear(); for (size_t i = 1; i < parser.num_rows(); i++) { - const std::string& snp_string = parser.cell(i, column_index); + const std::string &snp_string = parser.cell(i, column_index); auto rs_ids = string_split(snp_string, ';'); SNPSet set; - for (auto & id : rs_ids) { - auto snp = data->snpStorage->by_name(id); - set += snp; + for (auto &id: rs_ids) { + try { + auto snp = data->snpStorage->by_name(id); + set += snp; + } catch (epi::SNPNotFoundError &err) { + if (!ignore_unknown_snps) throw err; + } } + if (set.size() == 0) continue; - for (auto & col : property_cols) { + for (auto &col: property_cols) { set.set_attribute(col.second, parser.cell(i, col.first)); } diff --git a/src/jobs/ReadNeEDLSets.hpp b/src/jobs/ReadNeEDLSets.hpp index 5a4f13809..0ac790965 100644 --- a/src/jobs/ReadNeEDLSets.hpp +++ b/src/jobs/ReadNeEDLSets.hpp @@ -11,12 +11,13 @@ namespace epi { class ReadNeEDLSets : public Job { public: - ReadNeEDLSets(const std::string& path); + ReadNeEDLSets(const std::string& path, bool ignore_unknown_snps = false); void run(std::shared_ptr data) override; rapidjson::Value getConfig(rapidjson::Document &doc) override; private: std::string path; size_t column_index; + bool ignore_unknown_snps; }; } // epi diff --git a/src/jobs/ShufflePhenotypes.cpp b/src/jobs/ShufflePhenotypes.cpp new file mode 100644 index 000000000..8ae424637 --- /dev/null +++ b/src/jobs/ShufflePhenotypes.cpp @@ -0,0 +1,21 @@ +// +// Created by juli on 22.09.23. +// + +#include "ShufflePhenotypes.hpp" +#include "../util/TimeLogger.hpp" + +namespace epi { + void ShufflePhenotypes::run(std::shared_ptr data) { + if (data->snpStorage == nullptr) { + throw epi::Error("Cannot shuffle phenotypes because no instance was loaded yet."); + return; + } + + TimeLogger logger("shuffling phenotypes"); + + data->snpStorage->shuffle_phenotypes(); + + logger.stop(); + } +} // epi \ No newline at end of file diff --git a/src/jobs/ShufflePhenotypes.hpp b/src/jobs/ShufflePhenotypes.hpp new file mode 100644 index 000000000..86dd95f31 --- /dev/null +++ b/src/jobs/ShufflePhenotypes.hpp @@ -0,0 +1,23 @@ +// +// Created by juli on 22.09.23. +// + +#ifndef NEEDL_SHUFFLEPHENOTYPES_HPP +#define NEEDL_SHUFFLEPHENOTYPES_HPP + +#include "Job.hpp" + +namespace epi { + + class ShufflePhenotypes : public Job { + public: + void run(std::shared_ptr data) override; + }; + +} // epi + +#ifdef HEADER_ONLY +#include "ShufflePhenotypes.cpp" +#endif + +#endif //NEEDL_SHUFFLEPHENOTYPES_HPP diff --git a/src/pipelines/HelperPipelines.cpp b/src/pipelines/HelperPipelines.cpp index 6c27a6cdc..a02773969 100644 --- a/src/pipelines/HelperPipelines.cpp +++ b/src/pipelines/HelperPipelines.cpp @@ -9,6 +9,7 @@ #include "../jobs/WriteSets.hpp" #include "../jobs/CreateRandomSets.hpp" #include "../jobs/CreateSetKMers.hpp" +#include "../jobs/ShufflePhenotypes.hpp" void @@ -38,6 +39,7 @@ void HelperPipelines::calculate_scores( std::string rank_model, std::string out_path, std::vector> snp_annotation_pipeline, + bool shuffle_phenotypes, int num_threads, bool random_sets, size_t num_random_sets, @@ -45,6 +47,7 @@ void HelperPipelines::calculate_scores( size_t k_min, size_t k_max ) { + if (num_threads < 0) { throw epi::Error("Invalid thread number provided."); } @@ -65,6 +68,10 @@ void HelperPipelines::calculate_scores( ) }}; + if (shuffle_phenotypes) { + seq.add(std::make_shared()); + } + seq.add(snp_annotation_pipeline); seq.add(snp_reader); diff --git a/src/pipelines/HelperPipelines.hpp b/src/pipelines/HelperPipelines.hpp index e0606b499..79b0a9f36 100644 --- a/src/pipelines/HelperPipelines.hpp +++ b/src/pipelines/HelperPipelines.hpp @@ -30,6 +30,7 @@ class HelperPipelines { std::string rank_model, std::string out_path, std::vector> snp_annotation_pipeline, + bool shuffle_phenotypes, int num_threads = 0, bool random_sets = false, size_t num_random_sets = 1000, diff --git a/test/model/src/NeEDL.cpp b/test/model/src/NeEDL.cpp index 0760a9763..800be5527 100644 --- a/test/model/src/NeEDL.cpp +++ b/test/model/src/NeEDL.cpp @@ -3,6 +3,7 @@ // #ifdef CMAKE_RELEASE +#pragma message ("CMAKE_RELEASE set") #define HEADER_ONLY #endif @@ -10,7 +11,10 @@ #include #include "../../src/pipelines/NeEDLPipeline.hpp" - +#include "../../src/util/helper_functions.hpp" +#include "../../../src/jobs/SeedingRandomConnected.hpp" +#include "../../../src/jobs/SeedingCommunityWise.hpp" +#include "../../../src/jobs/SeedingQuantumComputing.hpp" diff --git a/test/model/src/calculate_scores.cpp b/test/model/src/calculate_scores.cpp index 17fe9b89e..84ed14857 100644 --- a/test/model/src/calculate_scores.cpp +++ b/test/model/src/calculate_scores.cpp @@ -72,6 +72,13 @@ int main(int argc, char **argv) { std::string k_mers; app.add_option("--k-mers", k_mers, "Give a number (eg. 4) or a range (eg. 2-4) for k to generate scores of all k-mers."); + bool ignore_unknown_snps = false; + app.add_flag("--ignore-unknown-snps", ignore_unknown_snps, "If this flag is set, all SNP sets that contain a SNP which is not present in the selected dataset are ignored."); + + bool shuffle_phenotypes = false; + app.add_flag("--shuffle-phenotypes", shuffle_phenotypes, "If this flag is set, the individuals' phenotypes are shuffled prior to score calculation."); + + // Parse the options. @@ -80,11 +87,11 @@ int main(int argc, char **argv) { snp_sets_input_type = toUpperCase(snp_sets_input_type); std::shared_ptr reader; if (snp_sets_input_type == "NEEDL") { - reader = std::make_shared(snp_sets_input_file); + reader = std::make_shared(snp_sets_input_file, ignore_unknown_snps); } else if (snp_sets_input_type == "MACOED") { - reader = std::make_shared(snp_sets_input_file); + reader = std::make_shared(snp_sets_input_file, ignore_unknown_snps); } else if (snp_sets_input_type == "LINDEN") { - reader = std::make_shared(snp_sets_input_file); + reader = std::make_shared(snp_sets_input_file, ignore_unknown_snps); } else { throw epi::Error("Unknown SNP-sets input type: " + snp_sets_input_type); } @@ -126,6 +133,7 @@ int main(int argc, char **argv) { rank_model, snp_sets_output_file, snpAnnotationPipeline, + shuffle_phenotypes, num_threads, create_random_sets, num_random_sets,