Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MarkDuplicates strategy of flow based reads that looks only at the qualities close to the end of the read #1942

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jacoco.data
.gradle
build
*.swp

.vscode
ilyasoifer marked this conversation as resolved.
Show resolved Hide resolved
# OSX file system stuff
.DS_STORE

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@

public class MarkDuplicatesForFlowArgumentCollection {

public enum FLOW_DUPLICATE_SELECTION_STRATEGY {
FLOW_QUALITY_SUM_STRATEGY,
FLOW_END_QUALITY_STRATEGY
}
@Argument(doc = "enable parameters and behavior specific to flow based reads.", optional = true)
public boolean FLOW_MODE = false;

@Argument(doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " +
"(and correct) quality value is used for all bases of the same homopolymer.", optional = true)
public boolean FLOW_QUALITY_SUM_STRATEGY = false;
@Argument(doc = "Use specific quality summing strategy for flow based reads. Two strategies are available: " +
"FLOW_QUALITY_SUM_STRATEG: The selects the read with the best total homopolymer quality." +
" FLOW_END_QUALITY_STRATEGY: The strategy selects the read with the best homopolymer quality close to the end (10 bases) of the read. " +
" The latter strategy is recommended for samples with high duplication rate ", optional = true)
public FLOW_DUPLICATE_SELECTION_STRATEGY FLOW_DUP_STRATEGY = FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_QUALITY_SUM_STRATEGY;

@Argument(doc = "Make the end location of single end read be significant when considering duplicates, " +
"in addition to the start location, which is always significant (i.e. require single-ended reads to start and" +
Expand All @@ -26,6 +32,11 @@ public class MarkDuplicatesForFlowArgumentCollection {
"(for this argument, \"read end\" means 3' end)", optional = true)
public int UNPAIRED_END_UNCERTAINTY = 0;

@Argument(doc = "Maximal difference of the read start position that counted as equal. Useful for flow based " +
"reads where the end position might vary due to sequencing errors. " +
"(for this argument, \"read start\" means 5' end in the direction of sequencing)", optional = true)
public int UNPAIRED_START_UNCERTAINTY = 0;

@Argument(doc = "Skip first N flows, starting from the read's start, when considering duplicates. Useful for flow based reads where sometimes there " +
"is noise in the first flows " +
"(for this argument, \"read start\" means 5' end).", optional = true)
Expand Down
113 changes: 102 additions & 11 deletions src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.Log;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes;

import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -59,6 +60,7 @@ public class MarkDuplicatesForFlowHelper implements MarkDuplicatesHelper {
public static final char[] CLIPPING_TAG_CONTAINS_AQ = {'A', 'Q'};
public static final char[] CLIPPING_TAG_CONTAINS_QZ = {'Q', 'Z'};

public static final int DIST_FROM_END = 10;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's worth mentioning that the distance from the end is hard coded as 10 bases for the FLOW_END_QUALITY_STRATEGY

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mentioned in the parameter description. Is this OK?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, perfect

// instance of hosting MarkDuplicates
private final MarkDuplicates md;

Expand Down Expand Up @@ -105,19 +107,24 @@ public void generateDuplicateIndexes(final boolean useBarcodes, final boolean in
ReadEndsForMarkDuplicates firstOfNextChunk = null;
int nextChunkRead1Coordinate2Min = Integer.MAX_VALUE;
int nextChunkRead1Coordinate2Max = Integer.MIN_VALUE;
int nextChunkRead1Coordinate1Min = Integer.MAX_VALUE;
int nextChunkRead1Coordinate1Max = Integer.MIN_VALUE;

final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<>(200);
boolean containsPairs = false;
boolean containsFrags = false;

for (final ReadEndsForMarkDuplicates next : md.fragSort) {
if (firstOfNextChunk != null && areComparableForDuplicatesWithEndSignificance(firstOfNextChunk, next, useBarcodes,
nextChunkRead1Coordinate2Min, nextChunkRead1Coordinate2Max)) {
nextChunkRead1Coordinate2Min, nextChunkRead1Coordinate2Max, nextChunkRead1Coordinate1Min, nextChunkRead1Coordinate1Max)) {
nextChunk.add(next);
containsPairs = containsPairs || next.isPaired();
containsFrags = containsFrags || !next.isPaired();
if ( next.read2Coordinate != END_INSIGNIFICANT_VALUE) {
nextChunkRead1Coordinate2Min = Math.min(nextChunkRead1Coordinate2Min, next.read2Coordinate);
nextChunkRead1Coordinate2Max = Math.max(nextChunkRead1Coordinate2Max, next.read2Coordinate);
nextChunkRead1Coordinate1Min = Math.min(nextChunkRead1Coordinate1Min, next.read1Coordinate);
nextChunkRead1Coordinate1Max = Math.max(nextChunkRead1Coordinate1Max, next.read1Coordinate);

if ( firstOfNextChunk.read2Coordinate == END_INSIGNIFICANT_VALUE)
firstOfNextChunk = next;
Expand All @@ -129,6 +136,7 @@ public void generateDuplicateIndexes(final boolean useBarcodes, final boolean in
nextChunk.clear();
nextChunk.add(next);
firstOfNextChunk = next;
nextChunkRead1Coordinate1Min = nextChunkRead1Coordinate1Max = next.read1Coordinate;
if ( next.read2Coordinate != END_INSIGNIFICANT_VALUE)
nextChunkRead1Coordinate2Min = nextChunkRead1Coordinate2Max = next.read2Coordinate;
else {
Expand Down Expand Up @@ -172,9 +180,7 @@ public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final
}

// adjust score
if ( md.flowBasedArguments.FLOW_QUALITY_SUM_STRATEGY ) {
ends.score = computeFlowDuplicateScore(rec, ends.read2Coordinate);
}
ends.score = computeFlowDuplicateScore(rec, ends.read2Coordinate);

return ends;
}
Expand All @@ -184,22 +190,53 @@ public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final
*/
@Override
public short getReadDuplicateScore(final SAMRecord rec, final ReadEndsForMarkDuplicates pairedEnds) {
if (md.flowBasedArguments.FLOW_QUALITY_SUM_STRATEGY ) {
if (md.flowBasedArguments.FLOW_MODE){
return computeFlowDuplicateScore(rec, pairedEnds.read2Coordinate);
} else {
return md.getReadDuplicateScore(rec, pairedEnds);
}
}

//This method is identical to MarkDuplicates.areComparableForDuplicates but allows working with readStartUncertainty
protected boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs,
final boolean compareRead2, final boolean useBarcodes) {
boolean areComparable = lhs.libraryId == rhs.libraryId;

if (useBarcodes && areComparable) { // areComparable is useful here to avoid the casts below
final ReadEndsForMarkDuplicatesWithBarcodes lhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) lhs;
final ReadEndsForMarkDuplicatesWithBarcodes rhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) rhs;
areComparable = lhsWithBarcodes.barcode == rhsWithBarcodes.barcode &&
lhsWithBarcodes.readOneBarcode == rhsWithBarcodes.readOneBarcode &&
lhsWithBarcodes.readTwoBarcode == rhsWithBarcodes.readTwoBarcode;
}

if (areComparable) {
areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
lhs.orientation == rhs.orientation;
}

if (areComparable && compareRead2) {
areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
lhs.read2Coordinate == rhs.read2Coordinate;
}

return areComparable;
}

/**
* This method is identical in function to areComparableForDuplicates except that it accomodates for
* the possible significance of the end side of the reads (w/ or wo/ uncertainty). This is only
* applicable for flow mode invocation.
*/
private boolean areComparableForDuplicatesWithEndSignificance(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean useBarcodes,
final int lhsRead1Coordinate2Min, final int lhsRead1Coordinate2Max) {
boolean areComparable = md.areComparableForDuplicates(lhs, rhs, false, useBarcodes);
final int lhsRead1Coordinate2Min, final int lhsRead1Coordinate2Max,
final int lhsRead1Coordinate1Min, final int lhsRead1Coordinate1Max) {
boolean areComparable = areComparableForDuplicates(lhs, rhs, false, useBarcodes);

if (areComparable) {
areComparable = endCoorInRangeWithUncertainty(lhsRead1Coordinate1Min, lhsRead1Coordinate1Max,
rhs.read1Coordinate, md.flowBasedArguments.UNPAIRED_START_UNCERTAINTY);
}
if (areComparable) {
areComparable = (!endCoorSignificant(lhs.read2Coordinate, rhs.read2Coordinate) ||
endCoorInRangeWithUncertainty(lhsRead1Coordinate2Min, lhsRead1Coordinate2Max,
Expand Down Expand Up @@ -236,8 +273,8 @@ static protected int getFlowSumOfBaseQualities(final SAMRecord rec, final int th
final byte[] bases = rec.getReadBases();

// create iteration range and direction
final int startingOffset = !rec.getReadNegativeStrandFlag() ? 0 : bases.length;
final int endOffset = !rec.getReadNegativeStrandFlag() ? bases.length : 0;
final int startingOffset = !rec.getReadNegativeStrandFlag() ? 0 : bases.length - 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was this previously a bug? I don't see how this change is related to allowing a range in the start position or changing the quality score calculation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was a bug indeed

final int endOffset = !rec.getReadNegativeStrandFlag() ? bases.length : -1;
final int iterIncr = !rec.getReadNegativeStrandFlag() ? 1 : -1;

// loop on bases, extract qual related to homopolymer from start of homopolymer
Expand All @@ -257,6 +294,55 @@ static protected int getFlowSumOfBaseQualities(final SAMRecord rec, final int th
return score;
}

/**
* A quality selection strategy used for flow based reads.
*
* We look at the bases of the reads that are close to the ends of the fragment
* and calculate the minimal quality of the homopolymers.
*
* @param rec - SAMRecord to get a score for
* @param dist - Distance from the end
* @return - calculated score (see method description)
*/
static protected int getFlowSumOfBaseQualitiesNearEnds(final SAMRecord rec, int dist) {
int score = 100;

// access qualities and bases
final byte[] quals = rec.getBaseQualities();
final byte[] bases = rec.getReadBases();

boolean insideHpol = false;
if (dist > bases.length){
dist = bases.length;
}

for ( int i = 0 ; (i < dist) || ( insideHpol ) ; i ++ ) {
final byte base = bases[i];
if ( (i == bases.length - 1) || ( base != bases[i+1] )) {
insideHpol = false;
} else {
insideHpol = true;
}

if ( quals[i] < score) {
score = quals[i];
}
}

for ( int i = bases.length-1 ; (i > bases.length - 1 - dist) || ( insideHpol ) ; i -- ) {
final byte base = bases[i];
if ( (i == 0) || ( base != bases[i - 1] )) {
insideHpol = false;
} else {
insideHpol = true;
}

if ( quals[i] < score) {
score = quals[i];
}
}
return score;
}
private short computeFlowDuplicateScore(final SAMRecord rec, final int end) {

if ( end == END_INSIGNIFICANT_VALUE)
Expand All @@ -265,8 +351,13 @@ private short computeFlowDuplicateScore(final SAMRecord rec, final int end) {
Short storedScore = (Short)rec.getTransientAttribute(ATTR_DUPLICATE_SCORE);
if ( storedScore == null ) {
short score = 0;

score += (short) Math.min(getFlowSumOfBaseQualities(rec, md.flowBasedArguments.FLOW_EFFECTIVE_QUALITY_THRESHOLD), Short.MAX_VALUE / 2);
if (md.flowBasedArguments.FLOW_DUP_STRATEGY == MarkDuplicatesForFlowArgumentCollection.FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_QUALITY_SUM_STRATEGY) {
score += (short) Math.min(getFlowSumOfBaseQualities(rec, md.flowBasedArguments.FLOW_EFFECTIVE_QUALITY_THRESHOLD), Short.MAX_VALUE / 2);
} else if (md.flowBasedArguments.FLOW_DUP_STRATEGY == MarkDuplicatesForFlowArgumentCollection.FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_END_QUALITY_STRATEGY) {
score += (short) Math.min(getFlowSumOfBaseQualitiesNearEnds(rec, DIST_FROM_END), Short.MAX_VALUE / 2);
} else {
throw new IllegalArgumentException("Unknown flow duplicate selection strategy: " + md.flowBasedArguments.FLOW_DUP_STRATEGY);
}

score += rec.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0;
storedScore = score;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,16 @@ public Object[][] forFlowDataProvider() {
new TestRecordInfo(76, 12,"76M", true, "AAAC", null),
new TestRecordInfo(76, 12, "76M", false, "AACC", null)
},
new String[] { "FLOW_QUALITY_SUM_STRATEGY=false" },
new String[] { "FLOW_DUP_STRATEGY=FLOW_QUALITY_SUM_STRATEGY" },
new TesterModifier() {
@Override
public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) {
final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]);
records[0].setAttribute("tp", new int[76]);
records[1].setAttribute("tp", new int[76]);
records[0].getBaseQualities()[1] = 25; // dip inside AAA
records[0].getBaseQualities()[0] = 25; // dip inside AAA
records[0].getBaseQualities()[2] = 25; // dip inside AAA

}
}
},
Expand All @@ -185,14 +187,33 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester)
new TestRecordInfo(76, 12,"76M", false, "AAAC", null),
new TestRecordInfo(76, 12, "76M", true, "AACC", null)
},
new String[] { "FLOW_QUALITY_SUM_STRATEGY=true" },
new String[] { "FLOW_DUP_STRATEGY=FLOW_QUALITY_SUM_STRATEGY" },
new TesterModifier() {
@Override
public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) {
final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]);
records[0].setAttribute("tp", new int[76]);
records[1].setAttribute("tp", new int[76]);
records[0].getBaseQualities()[1] = 25; // dip inside AAA
}
}
},
// testFLOW_END_QUALITY_STRATEGY: flow (homopolymer based) minumum
{
DuplicateScoringStrategy.ScoringStrategy.SUM_OF_BASE_QUALITIES,
new TestRecordInfo[] {
new TestRecordInfo(76, 12,"76M", true, "AAAC", null),
new TestRecordInfo(76, 12, "76M", false, "AACC", null)
},
new String[] { "FLOW_DUP_STRATEGY=FLOW_END_QUALITY_STRATEGY" },
new TesterModifier() {
@Override
public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) {
final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]);
records[0].setAttribute("tp", new int[76]);
records[1].setAttribute("tp", new int[76]);
records[0].getBaseQualities()[1] = 25; // dip inside AAA
records[1].getBaseQualities()[30] = 10;
}
}
},
Expand All @@ -208,6 +229,17 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester)
new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_END_UNCERTAINTY=10" }, null
},

// testUNPAIRED_START_UNCERTAINTY: End location is significant and uncertain, end sorted
{
null,
new TestRecordInfo[] {
new TestRecordInfo(74, 12, null, false, null, null),
new TestRecordInfo(64, 22, null, true, null, null),
new TestRecordInfo(54, 32, null, true, null, null)
},
new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_START_UNCERTAINTY=10" }, null
},

// testUNPAIRED_END_UNCERTAINTY: End location is significant and uncertain, end not sorted
{
null,
Expand Down Expand Up @@ -267,7 +299,7 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester)
}

@Test(dataProvider = "forFlowDataProvider")
public void testForFlow(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy, final TestRecordInfo[] recInfos, final String[] params, TesterModifier modifier) {
public void testForFlowMDCall(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy, final TestRecordInfo[] recInfos, final String[] params, TesterModifier modifier) {

// get tester, build records
final AbstractMarkDuplicatesCommandLineProgramTester tester =
Expand Down Expand Up @@ -339,4 +371,33 @@ public void testGetFlowSumOfBaseQualities(final String bases, final byte[] quals
Assert.assertEquals(score, expectedScore);
}

@DataProvider(name ="getFlowEndBaseQualitiesDataProvider")
public Object[][] getFlowEndBaseQualitiesDataProvider() {
return new Object[][] {
{ "AAAA", new byte[] {50,50,50,50}, 2, 50 },
{ "AAAA", new byte[] {50,10,10,50}, 4, 10 },
{ "ACCA", new byte[] {20,10,10,20}, 1, 20 },
{ "AABBCC", new byte[] {50,50,10,10,40,40}, 30, 10 },
};
}

@Test(dataProvider = "getFlowEndBaseQualitiesDataProvider")
public void testGetFlowEndBaseQualities(final String bases, final byte[] quals, final int threshold, final int expectedScore) {

// build record
final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(bases.length());
tester.addMappedFragment(0, 12, false, 50);

// install bases and quals
final SAMRecord rec = tester.getSamRecordSetBuilder().getRecords().iterator().next();
System.arraycopy(bases.getBytes(), 0, rec.getReadBases(), 0,bases.length());
System.arraycopy(quals, 0, rec.getBaseQualities(), 0, quals.length);

// calculate score
final int score = MarkDuplicatesForFlowHelper.getFlowSumOfBaseQualitiesNearEnds(rec, threshold);
Assert.assertEquals(score, expectedScore);
}


}
Loading