Skip to content

Commit

Permalink
Applies --min-recom to reference sites instead of typed sites.
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathonl committed Nov 9, 2023
1 parent 498ea85 commit 42bead8
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 102 deletions.
127 changes: 41 additions & 86 deletions src/input_prep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,10 @@ bool load_reference_haplotypes(const std::string& file_path,
const std::unordered_set<std::string>& subset_ids,
std::vector<target_variant>& target_sites,
reduced_haplotypes& typed_only_reference_data,
reduced_haplotypes& full_reference_data)
reduced_haplotypes& full_reference_data,
genetic_map_file* map_file,
float min_recom,
float default_match_error)
{
savvy::reader input(file_path);

Expand All @@ -198,18 +201,21 @@ bool load_reference_haplotypes(const std::string& file_path,
if (!input.read(var))
return std::cerr << "Notice: no variant records in reference query region (" << extended_reg.chromosome() << ":" << extended_reg.from() << "-" << extended_reg.to() << ")\n", input.bad() ? false : true;

double start_cm = 0.;
double no_recom_prob = 1.;
double prev_cm = 0.;
uint32_t prev_ref_pos = 0;
std::vector<std::int8_t> tmp_geno;
unique_haplotype_block block;
auto tar_it = target_sites.begin();
auto recom_it = tar_it;
int res;
while ((res = block.deserialize(input, var)) > 0)
{
block.remove_eov();
if (block.variants().empty() || block.variants().front().pos > extended_reg.to())
break;

block.fill_cm_from_recom(start_cm);
//block.fill_cm_from_recom(start_cm); // TODO: REMOVE

for (auto ref_it = block.variants().begin(); ref_it != block.variants().end(); ++ref_it)
{
Expand All @@ -224,113 +230,62 @@ bool load_reference_haplotypes(const std::string& file_path,
for (std::size_t i = 0; i < tmp_geno.size(); ++i)
tmp_geno[i] = ref_it->gt[block.unique_map()[i]];

if (it->pos == recom_it->pos) // This condition is probably redundant with prev_ref_pos check
recom_it->recom = 0.f;
else
recom_it->recom = std::max<float>(1. - no_recom_prob, 0.f); // min_recom_s3);
assert(recom_it->recom <= 1.f && recom_it->recom >= 0.f);

typed_only_reference_data.compress_variant({ref_it->chrom, ref_it->pos, ref_it->id, ref_it->ref, ref_it->alt, ref_it->err, ref_it->recom, ref_it->cm}, tmp_geno);

it->af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
auto test_af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
it->af = float((--typed_only_reference_data.end())->ac) / tmp_geno.size();
assert(test_af == it->af);
it->err = std::isnan(ref_it->err) ? default_match_error : ref_it->err;
it->in_ref = true;

if (it != tar_it)
std::swap(*it, *tar_it);

no_recom_prob = 1.;
recom_it = tar_it;
++tar_it;
break;
}
}

double switch_prob = std::isnan(ref_it->recom) ? 0.f : ref_it->recom;

if (map_file)
{
double cm = map_file->interpolate_centimorgan(ref_it->pos);
switch_prob = recombination::cm_to_switch_prob(cm - prev_cm);
prev_cm = cm;
}

if (prev_ref_pos > 0 && ref_it->pos != prev_ref_pos)
no_recom_prob *= 1. - std::max<double>(switch_prob, min_recom);

prev_ref_pos = ref_it->pos;
}

block.trim(impute_reg.from(), impute_reg.to());
if (!block.variants().empty())
full_reference_data.append_block(block);
}

assert(recom_it != target_sites.end());
if (recom_it != target_sites.end())
recom_it->recom = 0.f;

if (res < 0)
return false;


return !input.bad();
}
else
{
shrinkwrap::gz::istream input_file(file_path);
std::string line;

std::uint8_t m3vcf_version = 0;
const std::string m3vcf_version_line = "##fileformat=M3VCF";
while (std::getline(input_file, line))
{
if (line.substr(0, m3vcf_version_line.size()) == m3vcf_version_line)
{
if (line == "##fileformat=M3VCFv2.0")
m3vcf_version = 2;
else
m3vcf_version = 1;
break;
}

if (line.size() < 2 || line[1] != '#')
{
std::cerr << "Error: invalid reference file" << std::endl;
return false;
}
}

std::size_t n_samples = 0;
while (std::getline(input_file, line))
{
if (line.size() < 2 || line[1] != '#')
{
n_samples = std::count(line.begin(), line.end(), '\t') - 8;
break;
}
}

double start_cm = 0.;
auto tar_it = target_sites.begin();
unique_haplotype_block block;
std::vector<std::int8_t> tmp_geno;
while (block.deserialize(input_file, m3vcf_version, m3vcf_version == 1 ? n_samples : 2 * n_samples))
{
if (block.variants().empty() || block.variants().front().pos > extended_reg.to())
break;

block.fill_cm_from_recom(start_cm);

for (auto ref_it = block.variants().begin(); ref_it != block.variants().end(); ++ref_it)
{
while (tar_it != target_sites.end() && tar_it->pos < ref_it->pos)
++tar_it;

for (auto it = tar_it; it != target_sites.end() && it->pos == ref_it->pos; ++it)
{
if (it->ref == ref_it->ref && it->alt == ref_it->alt)
{
tmp_geno.resize(block.unique_map().size());
for (std::size_t i = 0; i < tmp_geno.size(); ++i)
tmp_geno[i] = ref_it->gt[block.unique_map()[i]];

typed_only_reference_data.compress_variant({ref_it->chrom, ref_it->pos, ref_it->id, ref_it->ref, ref_it->alt, ref_it->err, ref_it->recom, ref_it->cm}, tmp_geno);

it->af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
it->af = float((--typed_only_reference_data.end())->ac) / tmp_geno.size();
it->in_ref = true;

if (it != tar_it)
std::swap(*it, *tar_it);

++tar_it;
break;
}
}
}

block.trim(impute_reg.from(), impute_reg.to());
if (!block.variants().empty())
full_reference_data.append_block(block);
}
}

return !input.bad();
std::cerr << "Error: failed to open MVCF file" << std::endl;
return false;
}

// Old approach to setting recombination probs.
Expand Down Expand Up @@ -428,7 +383,7 @@ bool load_reference_haplotypes_old_recom_approach(const std::string& file_path,
prev_cm = cm;
}
if (ref_read_cnt)
no_recom_prob *= 1. - prev_recom; //std::max(prev_recom, 1e-5f); // TODO: cm -> recom
no_recom_prob *= 1. - std::max(prev_recom, 1e-5f); // TODO: cm -> recom
prev_recom = ref_it->cm; // TODO: cm -> recom
++ref_read_cnt;
}
Expand Down
16 changes: 14 additions & 2 deletions src/input_prep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,19 @@ bool load_reference_haplotypes(const std::string& file_path,
const std::unordered_set<std::string>& subset_ids,
std::vector<target_variant>& target_sites,
reduced_haplotypes& typed_only_reference_data,
reduced_haplotypes& full_reference_data);
reduced_haplotypes& full_reference_data,
genetic_map_file* map_file,
float min_recom,
float default_match_error);

bool load_reference_haplotypes_old_recom_approach(const std::string& file_path,
const savvy::genomic_region& extended_reg,
const savvy::genomic_region& impute_reg,
const std::unordered_set<std::string>& subset_ids,
std::vector<target_variant>& target_sites,
reduced_haplotypes& typed_only_reference_data,
reduced_haplotypes& full_reference_data,
genetic_map_file* map_file);

std::vector<target_variant> separate_target_only_variants(std::vector<target_variant>& target_sites);

Expand All @@ -29,4 +41,4 @@ bool convert_old_m3vcf(const std::string& input_path, const std::string& output_
bool compress_reference_panel(const std::string& input_path, const std::string& output_path, const std::string& map_file_path = "");


#endif // MINIMAC4_INPUT_PREP_HPP
#endif // MINIMAC4_INPUT_PREP_HPP
17 changes: 9 additions & 8 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ class imputation
start_time = std::time(nullptr);
reduced_haplotypes typed_only_reference_data(16, 512);
reduced_haplotypes full_reference_data;
if (!load_reference_haplotypes(args.ref_path(), extended_region, impute_region, args.sample_ids(), target_sites, typed_only_reference_data, full_reference_data))
std::unique_ptr<genetic_map_file> mf(args.map_path().empty() ? nullptr : new genetic_map_file(args.map_path(), impute_region.chromosome()));
if (!load_reference_haplotypes(args.ref_path(), extended_region, impute_region, args.sample_ids(), target_sites, typed_only_reference_data, full_reference_data, mf.get(), args.min_recom(), args.error_param()))
return std::cerr << "Error: failed loading reference haplotypes\n", false;
std::cerr << "Loading reference haplotypes took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;

Expand All @@ -64,8 +65,8 @@ class imputation
double impute_time = 0.;
double temp_write_time = 0.;

std::list<savvy::reader> temp_files;
std::list<savvy::reader> temp_emp_files;
std::list<savvy::reader> temp_files;
std::list<savvy::reader> temp_emp_files;
// std::list<std::string> temp_files;
// std::list<std::string> temp_emp_files;
full_dosages_results hmm_results;
Expand Down Expand Up @@ -97,11 +98,11 @@ class imputation
if (target_sites.empty())
return std::cerr << "Error: no target variants\n", false;

std::cerr << "Loading switch probabilities ..." << std::endl;
start_time = std::time(nullptr);
if (!load_variant_hmm_params(target_sites, typed_only_reference_data, args.error_param(), args.min_recom(), args.map_path()))
return std::cerr << "Error: parsing map file failed\n", false;
std::cerr << "Loading switch probabilities took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;
// std::cerr << "Loading switch probabilities ..." << std::endl;
// start_time = std::time(nullptr);
// if (!load_variant_hmm_params(target_sites, typed_only_reference_data, args.error_param(), args.min_recom(), args.map_path()))
// return std::cerr << "Error: parsing map file failed\n", false;
// std::cerr << "Loading switch probabilities took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;

auto reverse_maps = generate_reverse_maps(typed_only_reference_data);

Expand Down
10 changes: 5 additions & 5 deletions src/recombination.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,14 @@ genetic_map_file::genetic_map_file(const std::string& map_file_path, const std::
}
}

float genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
double genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
{
if (good_)
{
if (variant_pos < prev_rec_.pos)
{
auto basepair_cm = prev_rec_.map_value / double(prev_rec_.pos);
return float(double(variant_pos) * basepair_cm);
return double(double(variant_pos) * basepair_cm);
}

record temp_rec;
Expand All @@ -176,13 +176,13 @@ float genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
if (variant_pos < cur_rec_.pos)
{
assert(variant_pos - prev_rec_.pos < variant_pos); //TODO: handle gracefully
return float(prev_rec_.map_value + double(variant_pos - prev_rec_.pos) * basepair_cm);
return double(prev_rec_.map_value + double(variant_pos - prev_rec_.pos) * basepair_cm);
}

return float(cur_rec_.map_value + double(variant_pos - cur_rec_.pos) * basepair_cm); // Should we assume that basepair_cm is zero?
return double(cur_rec_.map_value + double(variant_pos - cur_rec_.pos) * basepair_cm); // Should we assume that basepair_cm is zero?
}

return std::numeric_limits<float>::quiet_NaN();;
return std::numeric_limits<double>::quiet_NaN();;
}

bool genetic_map_file::read_record(record& entry)
Expand Down
2 changes: 1 addition & 1 deletion src/recombination.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class genetic_map_file
operator bool() const { return good_; }

// This must be called in order. The pos argument must be >= to pos passed in previous call
float interpolate_centimorgan(std::size_t variant_pos);
double interpolate_centimorgan(std::size_t variant_pos);
private:
bool read_record(record& rec);
};
Expand Down

0 comments on commit 42bead8

Please sign in to comment.