Skip to content

Commit

Permalink
Merge pull request #13 from biomedbigdata/phe-shuffle-snp-filtering
Browse files Browse the repository at this point in the history
calculate_scores: implemented phenotype shuffling, silent SNP filtering
  • Loading branch information
juli-p authored Sep 25, 2023
2 parents 19ba413 + b107b60 commit db914d9
Show file tree
Hide file tree
Showing 18 changed files with 149 additions and 22 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -650,10 +650,10 @@ With calculate_scores the user can apply the statistical models mentioned in *Lo
--gwas-input-format JSON_EPIGEN

# The input file path. Either specify a path to a .json file or path to a directory containing only one .json file.
--input-path <path>
--gwas-input-file <path>

# Phenotype information of the dataset. NeEDL can deal with DICHOTOMOUS, CATEGORICAL and QUANTITATIVE phenotypes. --num-categories is only necessary for CATEGORICAL phenotypes with more than two categories.
--phenotype CATEGORICAL
--gwas-input-phenotype CATEGORICAL
--num-categories 2
```

Expand Down
4 changes: 4 additions & 0 deletions src/data_model/SNPStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,4 +392,8 @@ namespace epi {
return {};
}

void SNPStorage_WithoutGeno::shuffle_phenotypes() {
throw epi::Error("Shuffling phenotype data is not available in BIM-only SNPStorage.");
}

} // epi
3 changes: 3 additions & 0 deletions src/data_model/SNPStorage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ namespace epi {
virtual size_t num_categories() const = 0;
virtual std::vector<std::vector<size_t>> get_individuals_per_category(const SNPSet & set) const = 0;
virtual std::vector<size_t> get_num_individuals_per_category() const = 0;
virtual void shuffle_phenotypes() = 0;


SNPStorage();
Expand Down Expand Up @@ -157,6 +158,7 @@ namespace epi {
size_t num_categories() const override;
std::vector<std::vector<size_t>> get_individuals_per_category(const SNPSet & set) const override;
std::vector<size_t> get_num_individuals_per_category() const override;
void shuffle_phenotypes() override;

private:
std::vector<std::string> snp_names;
Expand Down Expand Up @@ -187,6 +189,7 @@ namespace epi {
size_t num_categories() const override;
std::vector<std::vector<size_t>> get_individuals_per_category(const SNPSet & set) const override;
std::vector<size_t> get_num_individuals_per_category() const override;
void shuffle_phenotypes() override;

private:
std::shared_ptr<epi::Instance<PhenoType>> instance;
Expand Down
5 changes: 5 additions & 0 deletions src/data_model/SNPStorage.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,11 @@ namespace epi {
return {};
}

template<class PhenoType>
void SNPStorage_PhenoType<PhenoType>::shuffle_phenotypes() {
instance->shuffle_phenotypes();
}


}

Expand Down
6 changes: 6 additions & 0 deletions src/data_model/SNP_t.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,11 @@ namespace epi {
}

SNPSet::SNPSet(const std::vector<SNP_t> &snps) {
/*
if (snps.size() > MAXIMUM_SNP_SET_SIZE) {
throw epi::Error("Tried to create a snp-set with size > " + std::to_string(MAXIMUM_SNP_SET_SIZE));
}
*/
this->snps = std::make_shared<std::vector<SNP_t>>(snps.begin(), snps.end());
std::sort(this->snps->begin(), this->snps->end());
}
Expand Down Expand Up @@ -183,9 +185,11 @@ namespace epi {
if (std::find(snps->begin(), snps->end(), snp) == snps->end()) {
snps->insert(snps->end(), snp);

/*
if (snps->size() > MAXIMUM_SNP_SET_SIZE) {
throw epi::Error("Tried to create a snp-set with size > " + std::to_string(MAXIMUM_SNP_SET_SIZE));
}
*/

std::sort(snps->begin(), snps->end());
scores = nullptr;
Expand Down Expand Up @@ -213,11 +217,13 @@ namespace epi {
snps->erase( unique( snps->begin(), snps->end() ), snps->end() );

if (snps->size() != size_before) {
/*
if (snps->size() > MAXIMUM_SNP_SET_SIZE) {
throw epi::Error("Tried to create a snp-set with size > " + std::to_string(MAXIMUM_SNP_SET_SIZE));
}
scores = nullptr;
scores_calculated = 0;
*/
}

return *this;
Expand Down
5 changes: 5 additions & 0 deletions src/jobs/InstanceLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,21 @@ namespace epi {
TimeLogger logger("loading GWAS data");
Logger::logLine("input file: " + path);

std::uniform_int_distribution<size_t> seed_gen (0, std::numeric_limits<size_t>::max());
size_t seed = seed_gen(data->random_device[omp_get_thread_num()]);

if (phenotype == options::PhenoType::QUANTITATIVE) {
auto instance = std::make_shared<Instance<epi::QuantitativePhenoType>>();
instance->load(inputFormat, path);
instance->set_seed(seed);
auto storage = std::make_shared<SNPStorage_PhenoType<epi::QuantitativePhenoType>>(instance);
auto snpStorage = std::static_pointer_cast<SNPStorage>(storage);
data->snpStorage = snpStorage;
SNPStorage::currentSnpStorage = snpStorage;
} else if (phenotype == options::PhenoType::CATEGORICAL) {
auto instance = std::make_shared<Instance<epi::CategoricalPhenoType>>(num_categories);
instance->load(inputFormat, path);
instance->set_seed(seed);
auto storage = std::make_shared<SNPStorage_PhenoType<epi::CategoricalPhenoType>>(instance);
auto snpStorage = std::static_pointer_cast<SNPStorage>(storage);
data->snpStorage = snpStorage;
Expand Down
21 changes: 18 additions & 3 deletions src/jobs/ReadLINDENSets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "../util/helper_functions.hpp"

namespace epi {
ReadLINDENSets::ReadLINDENSets(const std::string& path) {
ReadLINDENSets::ReadLINDENSets(const std::string& path, bool ignore_unknown_snps) {
FileFinder ff;
ff.add_ends_with(".reciprocalPairs");
this->path = find_file_by_ending(std::move(path), ff);
Expand All @@ -33,6 +33,9 @@ namespace epi {
}

file.close();


this->ignore_unknown_snps = ignore_unknown_snps;
}

void ReadLINDENSets::run(std::shared_ptr<DataModel> data) {
Expand All @@ -44,8 +47,20 @@ namespace epi {
const std::string& snp1_string = parser.cell(i, column1_index);
const std::string& snp2_string = parser.cell(i, column2_index);
SNPSet set;
set += data->snpStorage->by_name(snp1_string);
set += data->snpStorage->by_name(snp2_string);

try {
set += data->snpStorage->by_name(snp1_string);
} catch (epi::SNPNotFoundError &err) {
if (!ignore_unknown_snps) throw err;
}

try {
set += data->snpStorage->by_name(snp2_string);
} catch (epi::SNPNotFoundError &err) {
if (!ignore_unknown_snps) throw err;
}

if (set.size() == 0) continue;

set.set_attribute("LINDEN_rank", std::to_string(i));

Expand Down
3 changes: 2 additions & 1 deletion src/jobs/ReadLINDENSets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ namespace epi {

class ReadLINDENSets : public Job {
public:
ReadLINDENSets(const std::string& path);
ReadLINDENSets(const std::string& path, bool ignore_unknown_snps = false);
void run(std::shared_ptr<DataModel> data) override;
rapidjson::Value getConfig(rapidjson::Document &doc) override;
private:
std::string path;
size_t column1_index, column2_index;
bool ignore_unknown_snps;
};

} // epi
Expand Down
23 changes: 19 additions & 4 deletions src/jobs/ReadMACOEDSets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@
#include "../util/helper_functions.hpp"

namespace epi {
ReadMACOEDSets::ReadMACOEDSets(const std::string& path) {
ReadMACOEDSets::ReadMACOEDSets(const std::string& path, bool ignore_unknown_snps) {
FileFinder ff;
ff.add_ends_with(".txt");
this->path = find_file_by_ending(std::move(path), ff);

// do not do a column check here because these files are really fucked up
// do not perform a column check here because these files are really fucked up


this->ignore_unknown_snps = ignore_unknown_snps;
}

void ReadMACOEDSets::run(std::shared_ptr<DataModel> data) {
Expand All @@ -30,8 +33,20 @@ namespace epi {
const std::string& snp1_string = splits[1];
const std::string& snp2_string = splits[2];
SNPSet set;
set += data->snpStorage->by_name(snp1_string);
set += data->snpStorage->by_name(snp2_string);

try {
set += data->snpStorage->by_name(snp1_string);
} catch (epi::SNPNotFoundError &err) {
if (!ignore_unknown_snps) throw err;
}

try {
set += data->snpStorage->by_name(snp2_string);
} catch (epi::SNPNotFoundError &err) {
if (!ignore_unknown_snps) throw err;
}

if (set.size() == 0) continue;

data->snpSetStorage.push_back(set);
}
Expand Down
3 changes: 2 additions & 1 deletion src/jobs/ReadMACOEDSets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ namespace epi {

class ReadMACOEDSets : public Job {
public:
ReadMACOEDSets(const std::string& path);
ReadMACOEDSets(const std::string& path, bool ignore_unknown_snps = false);
void run(std::shared_ptr<DataModel> data) override;
rapidjson::Value getConfig(rapidjson::Document &doc) override;
private:
std::string path;
bool ignore_unknown_snps;
};

} // epi
Expand Down
19 changes: 13 additions & 6 deletions src/jobs/ReadNeEDLSets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "../util/helper_functions.hpp"

namespace epi {
ReadNeEDLSets::ReadNeEDLSets(const std::string& path) {
ReadNeEDLSets::ReadNeEDLSets(const std::string& path, bool ignore_unknown_snps) {
FileFinder ff;
ff.add_ends_with(".csv");
this->path = find_file_by_ending(std::move(path), ff);
Expand All @@ -30,6 +30,8 @@ namespace epi {
}

file.close();

this->ignore_unknown_snps = ignore_unknown_snps;
}

void ReadNeEDLSets::run(std::shared_ptr<DataModel> data) {
Expand All @@ -53,15 +55,20 @@ namespace epi {

data->snpSetStorage.clear();
for (size_t i = 1; i < parser.num_rows(); i++) {
const std::string& snp_string = parser.cell(i, column_index);
const std::string &snp_string = parser.cell(i, column_index);
auto rs_ids = string_split(snp_string, ';');
SNPSet set;
for (auto & id : rs_ids) {
auto snp = data->snpStorage->by_name(id);
set += snp;
for (auto &id: rs_ids) {
try {
auto snp = data->snpStorage->by_name(id);
set += snp;
} catch (epi::SNPNotFoundError &err) {
if (!ignore_unknown_snps) throw err;
}
}
if (set.size() == 0) continue;

for (auto & col : property_cols) {
for (auto &col: property_cols) {
set.set_attribute(col.second, parser.cell(i, col.first));
}

Expand Down
3 changes: 2 additions & 1 deletion src/jobs/ReadNeEDLSets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ namespace epi {

class ReadNeEDLSets : public Job {
public:
ReadNeEDLSets(const std::string& path);
ReadNeEDLSets(const std::string& path, bool ignore_unknown_snps = false);
void run(std::shared_ptr<DataModel> data) override;
rapidjson::Value getConfig(rapidjson::Document &doc) override;
private:
std::string path;
size_t column_index;
bool ignore_unknown_snps;
};

} // epi
Expand Down
21 changes: 21 additions & 0 deletions src/jobs/ShufflePhenotypes.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//
// Created by juli on 22.09.23.
//

#include "ShufflePhenotypes.hpp"
#include "../util/TimeLogger.hpp"

namespace epi {
void ShufflePhenotypes::run(std::shared_ptr<DataModel> data) {
if (data->snpStorage == nullptr) {
throw epi::Error("Cannot shuffle phenotypes because no instance was loaded yet.");
return;
}

TimeLogger logger("shuffling phenotypes");

data->snpStorage->shuffle_phenotypes();

logger.stop();
}
} // epi
23 changes: 23 additions & 0 deletions src/jobs/ShufflePhenotypes.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//
// Created by juli on 22.09.23.
//

#ifndef NEEDL_SHUFFLEPHENOTYPES_HPP
#define NEEDL_SHUFFLEPHENOTYPES_HPP

#include "Job.hpp"

namespace epi {

class ShufflePhenotypes : public Job {
public:
void run(std::shared_ptr<DataModel> data) override;
};

} // epi

#ifdef HEADER_ONLY
#include "ShufflePhenotypes.cpp"
#endif

#endif //NEEDL_SHUFFLEPHENOTYPES_HPP
7 changes: 7 additions & 0 deletions src/pipelines/HelperPipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "../jobs/WriteSets.hpp"
#include "../jobs/CreateRandomSets.hpp"
#include "../jobs/CreateSetKMers.hpp"
#include "../jobs/ShufflePhenotypes.hpp"


void
Expand Down Expand Up @@ -38,13 +39,15 @@ void HelperPipelines::calculate_scores(
std::string rank_model,
std::string out_path,
std::vector<std::shared_ptr<epi::Job>> snp_annotation_pipeline,
bool shuffle_phenotypes,
int num_threads,
bool random_sets,
size_t num_random_sets,
bool create_k_mers,
size_t k_min,
size_t k_max
) {

if (num_threads < 0) {
throw epi::Error("Invalid thread number provided.");
}
Expand All @@ -65,6 +68,10 @@ void HelperPipelines::calculate_scores(
)
}};

if (shuffle_phenotypes) {
seq.add(std::make_shared<epi::ShufflePhenotypes>());
}

seq.add(snp_annotation_pipeline);
seq.add(snp_reader);

Expand Down
1 change: 1 addition & 0 deletions src/pipelines/HelperPipelines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class HelperPipelines {
std::string rank_model,
std::string out_path,
std::vector<std::shared_ptr<epi::Job>> snp_annotation_pipeline,
bool shuffle_phenotypes,
int num_threads = 0,
bool random_sets = false,
size_t num_random_sets = 1000,
Expand Down
6 changes: 5 additions & 1 deletion test/model/src/NeEDL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@
//

#ifdef CMAKE_RELEASE
#pragma message ("CMAKE_RELEASE set")
#define HEADER_ONLY
#endif

#include <CLI11.hpp>
#include <omp.h>

#include "../../src/pipelines/NeEDLPipeline.hpp"

#include "../../src/util/helper_functions.hpp"
#include "../../../src/jobs/SeedingRandomConnected.hpp"
#include "../../../src/jobs/SeedingCommunityWise.hpp"
#include "../../../src/jobs/SeedingQuantumComputing.hpp"



Expand Down
Loading

0 comments on commit db914d9

Please sign in to comment.