Skip to content

Commit

Permalink
Merge pull request #26 from biomedbigdata/eqtl_mapping
Browse files Browse the repository at this point in the history
Eqtl mapping
  • Loading branch information
juli-p authored Feb 22, 2024
2 parents 6da8f14 + 8de4727 commit 7cecdbd
Show file tree
Hide file tree
Showing 13 changed files with 195 additions and 69 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ test/model/res/*
# unzipped GRNs
/data/GRN/*.csv

# unzipped eqtl_mappings
data/eqtl_mapping/*.csv

#dbSNP
/data/dbSNP/no_pseudogenes/snps_restruc_full.csv
/data/dbSNP/no_pseudogenes/snps_cleaned_part1.csv
Expand Down
Binary file added data/eqtl_mapping/eqtl_mapping.csv.zip
Binary file not shown.
89 changes: 89 additions & 0 deletions src/jobs/EQTLAnnotator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
//
// Created by juli on 28.01.24.
//

#include "EQTLAnnotator.hpp"
#include "../util/FileFinder.hpp"
#include "../util/helper_functions.hpp"

namespace epi {
EQTLAnnotator::EQTLAnnotator(const std::vector<std::string> &tissue_types, double pvalue_cutoff, bool bh_correction, const std::string &data_directory)
: SnpCsvAnnotator(true, '\t', -1, -1) {
this->tissue_types = { tissue_types.begin(), tissue_types.end() };
this-> pvalue_cutoff = pvalue_cutoff;
this->bh_correction = bh_correction;

FileFinder ff;
ff.add_ends_with(".csv");
ff.add_contains("eqtl_mapping");
path = find_file_by_ending(data_directory + "eqtl_mapping/", ff);

check_columns(0, 1);

if (!this->tissue_types.empty()) {
Logger::logLine("Checking if given tissue types for eQTL mapping are valid.");
// check tissue types and num columns
CSVParser parser;
parser.parse(path, '\t');
if (parser.num_columns() != 4) throw epi::Error("EQTL annotation file does not contain exactly 4 columns");

std::unordered_set<std::string> tissues;
for (size_t i = 1; i < parser.num_rows(); ++i) {
tissues.insert(parser.cell(i, 3));
}
bool check_failed = false;
for (const auto &tissue: this->tissue_types) {
if (!tissues.contains(tissue)) {
Logger::logLine("tissue " + tissue + " does not exist in eQTL dataset.");
check_failed = true;
}
}

if (check_failed) {
Logger::logLine("Allowed tissues are: ");
for (const auto &tissue: tissues) {
Logger::logLine(" " + tissue);
}
throw epi::Error(
"Some tissue types were selected for eQTL annotation that do not exist in the dataset.");
}
}
}

std::vector<bool> EQTLAnnotator::filter_entries(const CSVParser &parser) {
if (parser.num_columns() != 4) throw epi::Error("EQTL annotation file does not contain exactly 4 columns");

Logger::logLine("Filter eQTL entries based on selected tissues.");

// filter by tissue
std::vector<size_t> indices;
std::vector<double> pvalues;
for (size_t i = 1; i < parser.num_rows(); ++i) {
if (tissue_types.empty() || tissue_types.contains(parser.cell(i, 3))) {
indices.push_back(i);
pvalues.push_back(std::stod(parser.cell(i, 2)));
}
}

// do bh filtering
if (bh_correction) {
Logger::logLine("Apply Benjamini-Hochberg correction to p-values of eQTL entries");
benjamini_hochberg_correction(pvalues);
}

// create filter mask
Logger::logLine("Filter eQTL entries with p-value cutoff " + Logger::to_string(pvalue_cutoff));
std::vector<bool> result (parser.num_rows(), false);
size_t num_selected = 0;
for (size_t i = 0; i < indices.size(); ++i) {
if (pvalues[i] < pvalue_cutoff) {
result[indices[i]] = true;
++num_selected;
}
}
Logger::logLine("selected " + Logger::to_string(num_selected) + " of " + Logger::to_string(indices.size()) + " eQTL entries");

return result;
}

} // epi
31 changes: 31 additions & 0 deletions src/jobs/EQTLAnnotator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
//
// Created by juli on 28.01.24.
//

#ifndef NEEDL_EQTLANNOTATOR_HPP
#define NEEDL_EQTLANNOTATOR_HPP

#include "SnpCsvAnnotator.hpp"

namespace epi {

class EQTLAnnotator : public SnpCsvAnnotator {
public:
explicit EQTLAnnotator(const std::vector<std::string>& tissue_types, double pvalue_cutoff = 0.05, bool bh_correction = true, const std::string& data_directory = "data/");

private:
std::vector<bool> filter_entries(const epi::CSVParser &parser) override;

std::unordered_set<std::string> tissue_types;
double pvalue_cutoff;
bool bh_correction;
};

} // epi


#ifdef HEADER_ONLY
#include "EQTLAnnotator.cpp"
#endif

#endif //NEEDL_EQTLANNOTATOR_HPP
5 changes: 5 additions & 0 deletions src/jobs/MMAFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "MMAFilter.hpp"
#include "../util/TimeLogger.hpp"
#include "../util/helper_functions.hpp"

namespace epi {
MMAFilter::MMAFilter(double filter_cutoff, bool use_BH_correction) {
Expand All @@ -30,6 +31,9 @@ namespace epi {
// correct p values
Logger::logLine("Correct MMA p-values with Benjamini-Hochberg");

benjamini_hochberg_correction(mma_scores);

/*
// clone vector with mma values
std::vector<std::pair<double, size_t>> mma_values_copy;
for(size_t i = 0; i < mma_scores.size(); i++) {
Expand Down Expand Up @@ -59,6 +63,7 @@ namespace epi {
// std::cout << maximum_marginal_association_for_snps_[x.second] << " -> " << x.first << std::endl;
mma_scores[x.second] = x.first;
}
*/
}

// store mma_scores and mark as removed if threshold is not reached
Expand Down
1 change: 1 addition & 0 deletions src/jobs/PlinkShufflePhenotype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "PlinkShufflePhenotype.hpp"
#include "../util/TimeLogger.hpp"
#include <filesystem>

namespace epi {

Expand Down
14 changes: 12 additions & 2 deletions src/jobs/SnpCsvAnnotator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,17 +89,21 @@ namespace epi {
CSVParser parser;
parser.parse(path, csv_separator);

const auto filter_map = filter_entries(parser);

std::vector<std::vector<std::string>> snp_splits (parser.num_rows()), annotation_splits(parser.num_rows());
if (snp_separator != -1) {
#pragma omp parallel for default(none) shared(has_header, snp_column, snp_separator, parser, snp_splits)
#pragma omp parallel for default(none) shared(has_header, snp_column, snp_separator, parser, snp_splits, filter_map)
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
if (!filter_map[i]) continue;
snp_splits[i] = string_split(parser.cell(i, snp_column), snp_separator);
}
}

if (annotation_separator != -1) {
#pragma omp parallel for default(none) shared(has_header, annotation_column, annotation_separator, parser, annotation_splits)
#pragma omp parallel for default(none) shared(has_header, annotation_column, annotation_separator, parser, annotation_splits, filter_map)
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
if (!filter_map[i]) continue;
annotation_splits[i] = string_split(parser.cell(i, annotation_column), annotation_separator);
}
}
Expand All @@ -108,6 +112,7 @@ namespace epi {

std::vector<std::string> snps, annotations;
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
if (!filter_map[i]) continue;
if (snp_separator == -1) {
snps.clear();
snps.push_back(parser.cell(i, snp_column));
Expand Down Expand Up @@ -233,5 +238,10 @@ namespace epi {
}
}

std::vector<bool> SnpCsvAnnotator::filter_entries(const CSVParser &parser) {
// accept all
return std::vector<bool>(parser.num_rows(), true);
}


} // epi
2 changes: 2 additions & 0 deletions src/jobs/SnpCsvAnnotator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ namespace epi {
protected:
explicit SnpCsvAnnotator(bool has_header = false, char csv_separator = ',', char snp_separator = -1, char annotation_separator = -1);

virtual std::vector<bool> filter_entries(const CSVParser &parser);

void check_columns(size_t snp_col, size_t annotation_col);
void check_columns(const std::string& snp_col, const std::string& annotation_col);

Expand Down
7 changes: 7 additions & 0 deletions src/pipelines/NeEDLPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "../jobs/MultiNetworkAggregator.hpp"
#include "../jobs/SnpCsvAnnotator.hpp"
#include "../jobs/ShinyAppLauncher.hpp"
#include "../jobs/EQTLAnnotator.hpp"


NeEDLPipeline::NeEDLPipeline(std::string input_path, std::string input_format, std::string input_phenotype,
Expand Down Expand Up @@ -112,6 +113,10 @@ void NeEDLPipeline::add_snp_annotation_source_dbSNP() {
snpAnnotationPipeline.push_back(std::make_shared<epi::DbSNPAnnotator>(data_directory));
}

void NeEDLPipeline::add_snp_annotation_source_eQTL(const std::vector<std::string>& tissue_selection, double pvalue_cutoff, bool bh_correction) {
snpAnnotationPipeline.push_back(std::make_shared<epi::EQTLAnnotator>(tissue_selection, pvalue_cutoff, bh_correction, data_directory));
}

void NeEDLPipeline::add_snp_annotation_source(const epi::SnpCsvAnnotator& annotation_source) {
snpAnnotationPipeline.push_back(std::make_shared<epi::SnpCsvAnnotator>(annotation_source));
}
Expand Down Expand Up @@ -157,3 +162,5 @@ void NeEDLPipeline::activate_MMA_filter(double cutoff, bool correct_BH) {
mmaFilter = std::make_shared<epi::MMAFilter>(cutoff, correct_BH);
}



1 change: 1 addition & 0 deletions src/pipelines/NeEDLPipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class NeEDLPipeline {

// snp annotation
void add_snp_annotation_source_dbSNP();
void add_snp_annotation_source_eQTL(const std::vector<std::string>& tissue_selection, double pvalue_cutoff, bool bh_correction);
void add_snp_annotation_source(const epi::SnpCsvAnnotator& annotation_source);

// networks to search (eg. BIOGRID)
Expand Down
33 changes: 33 additions & 0 deletions src/util/helper_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,37 @@ namespace epi {
return res_minutes;
}

void benjamini_hochberg_correction(std::vector<double> &pvalues) {
if (pvalues.empty()) return;

// convert to pvalue-index pairs
std::vector<std::pair<double, size_t>> pval_index_pairs;
for(size_t i = 0; i < pvalues.size(); i++) {
pval_index_pairs.push_back({ pvalues[i], i });
}

// sort pairs in ascending order by p value
std::sort(pval_index_pairs.begin(), pval_index_pairs.end(), [] (std::pair<double, size_t> a, std::pair<double, size_t> b) { return a.first < b.first; });

// correct p-value
for (size_t i = 0; i < pval_index_pairs.size(); i++) {
pval_index_pairs[i].first *= double(pval_index_pairs.size()) / double(i + 1);
}

// check that lower p-value cannot lead to higher corrected one comparing the others
double min = pval_index_pairs.back().first;
for (long i = pval_index_pairs.size() - 1; i >= 0; i--) {
if (pval_index_pairs[i].first > min) {
pval_index_pairs[i].first = min;
} else {
min = pval_index_pairs[i].first;
}
}

// sort back adjusted p-values
for (auto x : pval_index_pairs) {
pvalues[x.second] = x.first;
}
}

} // epi
68 changes: 1 addition & 67 deletions src/util/helper_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,74 +86,8 @@ namespace epi {
return str;
}

/*
// from https://www.intel.com/content/www/us/en/developer/articles/technical/an-efficient-parallel-three-way-quicksort-using-intel-c-compiler-and-openmp-45-library.html
template<class RanIt>
void qsort3w(RanIt _First, RanIt _Last)
{
if (_First >= _Last) return;
std::size_t _Size = 0L;
if ((_Size = std::distance(_First, _Last)) > 0)
{
RanIt _LeftIt = _First, _RightIt = _Last;
bool is_swapped_left = false, is_swapped_right = false;
typename std::iterator_traits<RanIt>::value_type _Pivot = *_First;
RanIt _FwdIt = _First + 1;
while (_FwdIt <= _RightIt)
{
if (*_FwdIt < _Pivot)
{
is_swapped_left = true;
std::iter_swap(_LeftIt, _FwdIt);
_LeftIt++; _FwdIt++;
}
else if (_Pivot < *_FwdIt) {
is_swapped_right = true;
std::iter_swap(_RightIt, _FwdIt);
_RightIt--;
}
else _FwdIt++;
}
if (_Size >= 10000L)
{
#pragma omp taskgroup
{
#pragma omp task untied mergeable
if ((std::distance(_First, _LeftIt) > 0) && (is_swapped_left))
qsort3w(_First, _LeftIt - 1);
#pragma omp task untied mergeable
if ((std::distance(_RightIt, _Last) > 0) && (is_swapped_right))
qsort3w(_RightIt + 1, _Last);
}
}
else
{
#pragma omp task untied mergeable
{
if ((std::distance(_First, _LeftIt) > 0) && is_swapped_left)
qsort3w(_First, _LeftIt - 1);
void benjamini_hochberg_correction(std::vector<double> &pvalues);

if ((std::distance(_RightIt, _Last) > 0) && is_swapped_right)
qsort3w(_RightIt + 1, _Last);
}
}
}
}
template<class BidirIt>
void parallel_sort(BidirIt _First, BidirIt _Last)
{
#pragma omp master
qsort3w(_First, _Last - 1);
}
*/

std::unordered_map<std::string, char> getEscapedCharMap ();

Expand Down
10 changes: 10 additions & 0 deletions test/model/src/NeEDL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,15 @@ int main(int argc, char** argv) {
std::vector<std::string> snp_annotation_methods;
app.add_option("--snp-annotate", snp_annotation_methods, "Declare annotation sources with this option. Option can be used multiple times to add multiple sources. The file must be a csv file with one column for the snp names (as in the input file) and one column for the annotations. Both columns can be multi-columns that contain multiple entries per row separated with a separating character. If a column is not a multi-column, give -1 as separator. If both column descriptors can be casted to a number this is used as a null-based index of the column Otherwise, the two values are searched for in the header line. The format of this option is <path>|<has-header ? 'yes' : 'no'>|<snp-col>|<annotation-col>|[<csv-separator>]|[<snp-separator>]|[<annotation-separator>].");

bool use_eqtl_mapping = false;
app.add_flag("--snp-annotate-eQTL", use_eqtl_mapping, "This option annotates the SNPs with the help of the included eQTL database curated from the eQTL catalogue. This results in a gene annotation.");

std::vector<std::string> eqtl_tissues;
app.add_option("--eQTL-tissue", eqtl_tissues, "This parameter can only be used together with --snp-annotate-eQTL. With this parameter, one can restrict the use of eQTL data to certain tissues. Specify this parameter multiple times to select multiple tissues. If this parameter is not specified, all available tissues are used.");

double eqtl_pvalue_cutoff = 0.05;
app.add_option("--eQTL-pvalue-cutoff", eqtl_tissues, "This parameter can only be used together with --snp-annotate-eQTL. This cutoff value is used to filter the entries in eQTL catalogue. The pvalues of all selected tissues are corrected with Benjamini-Hochberg before applying this cutoff. DEFAUL: 0.05");


// multi-network networks
bool use_BIOGRID = false;
Expand Down Expand Up @@ -573,6 +582,7 @@ int main(int argc, char** argv) {
if (!network_shuffle_method.empty()) pipeline.activate_network_shuffling(network_shuffle_method);

if (use_dbSNP) pipeline.add_snp_annotation_source_dbSNP();
if (use_eqtl_mapping) pipeline.add_snp_annotation_source_eQTL(eqtl_tissues, eqtl_pvalue_cutoff, true);
for (auto &method: snp_annotation_methods) {
pipeline.add_snp_annotation_source(epi::SnpCsvAnnotator::parse_from_source_string(method));
}
Expand Down

0 comments on commit 7cecdbd

Please sign in to comment.