Skip to content

Commit

Permalink
removed debug print. VCF header fix. Better event de-duping
Browse files Browse the repository at this point in the history
  • Loading branch information
walaj committed Dec 20, 2016
1 parent 5213029 commit b85e1e6
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 65 deletions.
58 changes: 45 additions & 13 deletions src/Snowman/BreakPoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ using namespace SeqLib;


// make a breakpoint from a discordant cluster
BreakPoint::BreakPoint(DiscordantCluster& tdc, const BWAWrapper * bwa, DiscordantClusterMap& dmap) {
BreakPoint::BreakPoint(DiscordantCluster& tdc, const BWAWrapper * bwa, DiscordantClusterMap& dmap,
const GenomicRegion& region) {

num_align = 0;
dc = tdc;
Expand Down Expand Up @@ -171,7 +172,8 @@ using namespace SeqLib;
}

// give a unique id
cname = dc.toRegionString();
cname = dc.toRegionString() + "__" + std::to_string(region.chr+1) + "_" + std::to_string(region.pos1) +
"_" + std::to_string(region.pos2) + "D";

// check if another cluster overlaps, but different strands
if (getSpan() > 800 || getSpan() == -1) // only check for large events.
Expand Down Expand Up @@ -755,7 +757,7 @@ BreakEnd::BreakEnd(const SeqLib::BamRecord& b) {
// aligned to way off region. Saw cases where this happend in tumor
// and not normal, so false-called germline event as somatic.
confidence = "NOLOCAL";
if (has_local_alignment)
else if (has_local_alignment)
confidence = "LOCALMATCH";
else if ( num_split > 1 && ( (cov_span <= (readlen + 5 ) && cov_span > 0) || cov_span < 0) )
confidence = "DUPREADS"; // the same sequences keep covering the split
Expand Down Expand Up @@ -1001,14 +1003,14 @@ void BreakPoint::scoreBreakpoint(double LOD_CUTOFF, double LOD_CUTOFF_DBSNP, dou
}

// provide a scaled LOD that accounts for MAPQ. Heuristic, not really used
int mapqr1 = b1.local ? std::max(30, b1.mapq) : b1.mapq; // if local, don't drop below 30
int mapqr2 = b2.local ? std::max(30, b2.mapq) : b2.mapq; // if local (aligns to within window), don't drop below 30
double scale = (double)( std::min(mapqr1, mapqr2) - 2 * b1.nm) / (double)60;
for (auto& i : allele)
i.second.SLO = i.second.LO * scale;
t.SLO = t.LO * scale;
n.SLO = n.LO * scale;
a.SLO = a.LO * scale;
//int mapqr1 = b1.local ? std::max(30, b1.mapq) : b1.mapq; // if local, don't drop below 30
//int mapqr2 = b2.local ? std::max(30, b2.mapq) : b2.mapq; // if local (aligns to within window), don't drop below 30
//double scale = (double)( std::min(mapqr1, mapqr2) - 2 * b1.nm) / (double)60;
//for (auto& i : allele)
// i.second.SLO = i.second.LO * scale;
//t.SLO = t.LO * scale;
//n.SLO = n.LO * scale;
//a.SLO = a.LO * scale;

// sanity check
int split =0;
Expand Down Expand Up @@ -1510,11 +1512,11 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH
if (indel)
ss << std::setprecision(4) << genotype << ":" <<
std::max(alt, cigar) << ":" << cov << ":" << GQ << ":" << PL << ":" << split << ":" << cigar
<< ":" << LO_n << ":" << LO << ":" << SLO;
<< ":" << LO_n << ":" << LO;// << ":" << SLO;
else
ss << std::setprecision(4) << genotype << ":" <<
alt << ":" << cov << ":" << GQ << ":" << PL << ":" << split
<< ":" << disc << ":" << LO_n << ":" << LO << ":" << SLO;
<< ":" << disc << ":" << LO_n << ":" << LO;// << ":" << SLO;

return ss.str();

Expand Down Expand Up @@ -1653,3 +1655,33 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH
return val;
}

bool ReducedBreakPoint::operator<(const ReducedBreakPoint& bp) const {

//ASDIS > ASSMB > COMP > DSCRD
if (std::strcmp(evidence,bp.evidence) < 0) // <
return true;
else if (std::strcmp(evidence, bp.evidence) > 0) // >
return false;

if (nsplit > bp.nsplit)
return true;
else if (nsplit < bp.nsplit)
return false;

if (tsplit > bp.tsplit)
return true;
else if (tsplit < bp.tsplit)
return false;

if (dc.ncount > bp.dc.ncount)
return true;
else if (dc.ncount < bp.dc.ncount)
return false;

if (dc.tcount > bp.dc.tcount)
return true;
else if (dc.tcount < bp.dc.tcount)
return false;

return false;
}
6 changes: 5 additions & 1 deletion src/Snowman/BreakPoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ struct ReducedBreakEnd {

}

// define how to sort these
bool operator<(const ReducedBreakPoint& bp) const;

ReducedBreakPoint() {}
~ReducedBreakPoint() {
__smart_check_free(ref);
Expand Down Expand Up @@ -257,7 +260,8 @@ struct ReducedBreakEnd {

/** Construct a breakpoint from a cluster of discordant reads
*/
BreakPoint(DiscordantCluster& tdc, const SeqLib::BWAWrapper * bwa, DiscordantClusterMap& dmap);
BreakPoint(DiscordantCluster& tdc, const SeqLib::BWAWrapper * bwa, DiscordantClusterMap& dmap,
const SeqLib::GenomicRegion& region);

BreakPoint() {}

Expand Down
1 change: 0 additions & 1 deletion src/Snowman/DiscordantCluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,6 @@ using namespace SeqLib;
for (auto& i : brv) {

// only cluster FR reads together, RF reads together, FF together and RR together
std::cout << i.Qname() << "\t" << i.PairOrientation() << std::endl;
if (i.PairOrientation() != orientation) {
continue;
}
Expand Down
13 changes: 4 additions & 9 deletions src/Snowman/SnowmanAssemblerEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,12 @@ void SnowmanAssemblerEngine::doAssembly(ReadTable *pRT, SeqLib::UnalignedSequenc
int cutoff = 0;

if (pass > 0) {
errorRate = 0.015f; // up to one bp mismatch
//errorRate = 0.015f; // up to one bp mismatch
errorRate = 0;
m_error_rate = errorRate;
min_overlap = 70;
min_overlap = m_min_overlap * 2;
}

bool exact = errorRate < 0.001f;

// remove duplicates if running in exact mode
Expand All @@ -189,12 +190,6 @@ void SnowmanAssemblerEngine::doAssembly(ReadTable *pRT, SeqLib::UnalignedSequenc
pSAf_nd->writeIndex();
pSAr_nd->writeIndex();

// do the cutoff later...
//if (pass > 0)
// cutoff = m_readlen * 1.10;

//int seedLength = min_overlap;
//int seedStride = seedLength;
bool bIrreducibleOnly = true; // default
int seedLength = 0;
int seedStride = 0;
Expand Down
6 changes: 3 additions & 3 deletions src/Snowman/run_snowman.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ void runSnowman(int argc, char** argv) {

// set the min overlap
if (!opt::sga::minOverlap)
opt::sga::minOverlap = (0.6 * readlen) < 30 ? 30 : 0.5 * readlen;
opt::sga::minOverlap = (0.6 * readlen) < 30 ? 30 : 0.6 * readlen;

ss << "...found read length of " << readlen << ". Min Overlap is " << opt::sga::minOverlap << std::endl;
ss << "...max read MAPQ detected: " << max_mapq_possible << std::endl;
Expand Down Expand Up @@ -1045,7 +1045,7 @@ bool runWorkUnit(const SeqLib::GenomicRegion& region, SnowmanWorkUnit& wu, long
i.second.m_reg1.chr == i.second.m_reg2.chr;
// DiscordantCluster not associated with assembly BP and has 2+ read support
if (!i.second.hasAssociatedAssemblyContig() && (i.second.tcount + i.second.ncount) > 1 && i.second.valid() && !below_size) {
BreakPoint tmpbp(i.second, main_bwa, dmap);
BreakPoint tmpbp(i.second, main_bwa, dmap, region);
bp_glob.push_back(tmpbp);
}
}
Expand Down Expand Up @@ -1092,7 +1092,7 @@ bool runWorkUnit(const SeqLib::GenomicRegion& region, SnowmanWorkUnit& wu, long
}
for (auto& i : bp_glob) {
if (ccc[i.b1.hash()] > 1)
i.confidence = "REPSEQ";
i.confidence = "REPVAR";
}

// remove somatic calls if they have a germline normal SV in them or indels with
Expand Down
57 changes: 19 additions & 38 deletions src/Snowman/vcf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@

#define VCF_SECONDARY_CAP 200
#define SOMATIC_LOD 1

#define DEDUPEPAD 200

using namespace std;

static std::string sv_format = "GT:AD:DP:GQ:PL:SR:DR:LR:LO:SL"; //"NALT:NALT_RP:NALT_SR:READ_ID";
static std::string indel_format = "GT:AD:DP:GQ:PL:SR:CR:LR:LO:SL";
static std::string sv_format = "GT:AD:DP:GQ:PL:SR:DR:LR:LO";
static std::string indel_format = "GT:AD:DP:GQ:PL:SR:CR:LR:LO";
static InfoMap flag_map;
static int global_id = 0;
static std::stringstream lod_ss;
Expand Down Expand Up @@ -240,17 +240,12 @@ VCFFile::VCFFile(std::string file, std::string id, const SeqLib::BamHeader& h, c
indel_header.addFilterField("LOWMAPQ","Assembly contig has less than MAPQ 10");
indel_header.addFilterField("SHORTALIGNMENT","Matched (M) contig frag to left or right of indel < 20 bp");
indel_header.addFilterField("LOWLOD","LOD score is less than the cutoff");
//indel_header.addFilterField("WEAKASSEMBLY","4 or fewer split reads");
//indel_header.addFilterField("WEAKCIGARMATCH","For indels <= 5 bp, require 8+ split reads or 4+ and 3+ cigar matches");
indel_header.addFilterField("MULTIMATCH", "Low MAPQ and this contig fragment maps well to multiple locations");
indel_header.addFilterField("LOWAS","Less than 80% of contig length is covered by a supporting read");
indel_header.addFilterField("NONVAR","0/0 is the most likely genotype");
indel_header.addFilterField("PASS", "LOD score pass");
//indel_header.addFilterField("GRAYLISTANDPON", "Indel overlaps with panel of normals, and has overlap with tricky genomic region");
indel_header.addFilterField("VLOWAF", "allelic fraction < 0.05");
//indel_header.addFilterField("LOWNORMCOV", "Fewer than 5 normal reads at this site");
//indel_header.addFilterField("GERMLOWAF", "Germline variant with support for being AF of 0.5+ (LR) less than cutoff");

//indel_header.addSampleField(sample_id_norm);
//indel_header.addSampleField(sample_id_tum);
//indel_header.colnames = indel_header.colnames + "\t" + sample_id_norm + "\t" + sample_id_tum;
indel_header.addFilterField("REPVAR", "Multiple conflicting variants at a highly repetitive region");

//add the SV info fields
sv_header.addInfoField("REPSEQ","1","String","Repeat sequence near the event");
Expand All @@ -263,22 +258,10 @@ VCFFile::VCFFile(std::string file, std::string id, const SeqLib::BamHeader& h, c
sv_header.addInfoField("IMPRECISE","0","Flag", "Imprecise structural variation");
sv_header.addInfoField("SECONDARY","0","Flag", "SV calls comes from a secondary alignment");
sv_header.addInfoField("HOMLEN","1","Integer","Length of base pair identical micro-homology at event breakpoints");
//sv_header.addInfoField("DBSNP","1","String","TRUE if variant overlaps a dbSNP site");
//sv_header.addInfoField("BKDIST","1","Integer","Distance between breakpoints (-1 if difference chromosomes");
sv_header.addInfoField("MAPQ","1","Integer","Mapping quality (BWA-MEM) of this fragement of the contig (-1 if discordant only)");
sv_header.addInfoField("MATEMAPQ","1","Integer","Mapping quality of the partner fragment of the contig");
//sv_header.addInfoField("NSPLIT","1","Integer","Number of split reads from the normal BAM");
//sv_header.addInfoField("TSPLIT","1","Integer","Number of split reads from the tumor BAM");
//sv_header.addInfoField("TDISC","1","Integer","Number of discordant read pairs from the tumor BAM");
//sv_header.addInfoField("NDISC","1","Integer","Number of discordant read pairs from the normal BAM");
sv_header.addInfoField("MATEID","1","String","ID of mate breakends");
//sv_header.addInfoField("SOMATIC","0","Flag","Variant is somatic");
sv_header.addInfoField("SUBN","1","Integer","Number of secondary alignments associated with this contig fragment");
//sv_header.addInfoField("TCOV","1","Integer","Max tumor coverage at break");
//sv_header.addInfoField("NCOV","1","Integer","Max normal coverage at break");
//sv_header.addInfoField("TFRAC","1","String","Tumor allelic fraction at break. -1 for undefined");
//sv_header.addInfoField("NFRAC","1","String","Normal allelic fraction at break. -1 for undefined");

sv_header.addInfoField("NUMPARTS","1","Integer","If detected with assembly, number of parts the contig maps to. Otherwise 0");
sv_header.addInfoField("EVDNC","1","String","Provides type of evidence for read. ASSMB is assembly only, ASDIS is assembly+discordant. DSCRD is discordant only.");
sv_header.addInfoField("SCTG","1","String","Identifier for the contig assembled by SnowmanSV to make the SV call");
Expand All @@ -290,32 +273,29 @@ VCFFile::VCFFile(std::string file, std::string id, const SeqLib::BamHeader& h, c
indel_header.addInfoField("SCTG","1","String","Identifier for the contig assembled by SnowmanSV to make the indel call");
indel_header.addInfoField("MAPQ","1","Integer","Mapping quality (BWA-MEM) of the assembled contig");
indel_header.addInfoField("SPAN","1","Integer","Size of the indel");

indel_header.addFormatField("GT", "1","String", "Genotype (currently not supported. Always 0/1)");
indel_header.addInfoField("REPSEQ","1","String","Repeat sequence near the variant");
indel_header.addFormatField("GT", "1","String", "Most likely genotype");
indel_header.addFormatField("AD", "1","Integer", "Allele depth: Number of reads supporting the variant");
indel_header.addFormatField("DP","1","Integer","Depth of coverage: Number of reads covering site.");
indel_header.addFormatField("GQ", "1","String", "Genotype quality (currently not supported. Always 0)");
indel_header.addFormatField("PL","1","Integer","Normalized likelihood of the current genotype (currently not supported, always 0)");
indel_header.addFormatField("PL",".","Float","Normalized likelihood of the current genotype");
indel_header.addFormatField("SR","1","Integer","Number of spanning reads for this variants");
indel_header.addFormatField("CR","1","Integer","Number of cigar-supported reads for this variant");
indel_header.addFormatField("LR","1","Float","Log-odds that this variant is AF=0 vs AF>=0.5");
indel_header.addFormatField("LO","1","Float","Log-odds that this variant is real vs artifact");
indel_header.addFormatField("SL","1","Float","Alignment-quality Scaled log-odds, where LO is LO * (MAPQ - 2*NM)/60");

sv_header.addFormatField("GT", "1","String", "Genotype (currently not supported. Always 0/1)");
sv_header.addFormatField("GT", "1","String", "Most likely genotype");
sv_header.addFormatField("AD", "1","Integer", "Allele depth: Number of reads supporting the variant");
sv_header.addFormatField("DP","1","Integer","Depth of coverage: Number of reads covering site.");
sv_header.addFormatField("GQ", "1","String", "Genotype quality (currently not supported. Always 0)");
sv_header.addFormatField("PL","1","Integer","Normalized likelihood of the current genotype (currently not supported, always 0)");
sv_header.addFormatField("PL",".","Float","Normalized likelihood of the current genotype");
sv_header.addFormatField("SR","1","Integer","Number of spanning reads for this variants");
sv_header.addFormatField("DR","1","Integer","Number of discordant-supported reads for this variant");
sv_header.addFormatField("LR","1","Float","Log-odds that this variant is REF vs AF=0.5");
sv_header.addFormatField("LO","1","Float","Log-odds that this variant is real vs artifact");
sv_header.addFormatField("SL","1","Float","Alignment-quality Scaled log-odds, where LO is LO * (MAPQ - 2*NM)/60");

indel_header.addInfoField("REPSEQ","1","String","Repeat sequence near the event");
//indel_header.addInfoField("GRAYLIST","0","Flag","Indel is low quality and cross a difficult region of genome");
//indel_header.addInfoField("SOMATIC","0","Flag","Variant is somatic");
indel_header.addInfoField("PON","1","Integer","Number of normal samples that have this indel present");
indel_header.addInfoField("NM","1","Integer","Number of mismatches of this alignment fragment to reference");
indel_header.addInfoField("READNAMES",".","String","IDs of ALT reads");
Expand Down Expand Up @@ -410,11 +390,11 @@ void VCFFile::deduplicate() {
for (auto& i : entry_pairs) {

// if it's already de-duped, dont do it again
if (dups.count(i.first))
continue;
//if (dups.count(i.first))
// continue;

// if both ends are close (within 10) then they match
pad = (i.second->bp->b1.gr.chr != i.second->bp->b2.gr.chr) || std::abs(i.second->bp->b1.gr.pos1 - i.second->bp->b2.gr.pos1) > 100 ? 10 : 1;
// if both ends are close then they match
pad = (i.second->bp->b1.gr.chr != i.second->bp->b2.gr.chr) || std::abs(i.second->bp->b1.gr.pos1 - i.second->bp->b2.gr.pos1) > DEDUPEPAD*2 ? DEDUPEPAD : 10;

++count;
SeqLib::GenomicIntervalVector giv1, giv2;
Expand Down Expand Up @@ -445,10 +425,11 @@ void VCFFile::deduplicate() {
if (grv2.at(j.value).id != i.first && ( is_pass == (grv2.at(j.value).pass) )) //j is hit, grv2.at(j.value).id is key of hit
++key_count[grv2.at(j.value).id];

//loop through hit keys and if key is hit twice (1 left, 1 right), it is an overlap
// loop through hit keys and if key is hit twice (1 left, 1 right), it is an overlap
for (auto& j : key_count) {
if (j.second == 2) { // left and right hit for this key. add
dups.insert(j.first);
if (*i.second->bp < *entry_pairs[j.first]->bp) // this has worst read coverage that what it overlaps, so mark as dup
dups.insert(j.first);
}
}
}
Expand Down

0 comments on commit b85e1e6

Please sign in to comment.