MarkDuplicates strategy of flow based reads that looks only at the qu…

…alities close to the end of the read (#1942) * Fixed a bug in getFlowSumOfBaseQualities * uncertain start position parameter in MarkDuplicates * Modified algorithm to determine the best read * Added a new duplicate selection strategy that looks close to the end of the read * Test fixed and strategy test added * added .vscode
broadinstitute · Feb 14, 2024 · c026f94 · c026f94
1 parent 692918d
commit c026f94
Show file tree

Hide file tree

Showing 4 changed files with 182 additions and 19 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,7 +16,7 @@ jacoco.data
 .gradle
 build
 *.swp
-
+.vscode
 # OSX file system stuff
 .DS_STORE
 

diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java
@@ -4,12 +4,18 @@
 
 public class MarkDuplicatesForFlowArgumentCollection {
 
+    public enum FLOW_DUPLICATE_SELECTION_STRATEGY {
+        FLOW_QUALITY_SUM_STRATEGY,
+        FLOW_END_QUALITY_STRATEGY
+    }
     @Argument(doc = "enable parameters and behavior specific to flow based reads.", optional = true)
     public boolean FLOW_MODE = false;
 
-    @Argument(doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " +
-            "(and correct) quality value is used for all bases of the same homopolymer.", optional = true)
-    public boolean FLOW_QUALITY_SUM_STRATEGY = false;
+    @Argument(doc = "Use specific quality summing strategy for flow based reads. Two strategies are available: " + 
+          "FLOW_QUALITY_SUM_STRATEG: The selects the read with the best total homopolymer quality." + 
+            " FLOW_END_QUALITY_STRATEGY: The strategy selects the read with the best homopolymer quality close to the end (10 bases) of the read. " +
+            " The latter strategy is recommended for samples with high duplication rate ", optional = true)
+    public FLOW_DUPLICATE_SELECTION_STRATEGY FLOW_DUP_STRATEGY = FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_QUALITY_SUM_STRATEGY;
 
     @Argument(doc = "Make the end location of single end read be significant when considering duplicates, " +
             "in addition to the start location, which is always significant (i.e. require single-ended reads to start and" +
@@ -26,6 +32,11 @@ public class MarkDuplicatesForFlowArgumentCollection {
             "(for this argument, \"read end\" means 3' end)", optional = true)
     public int UNPAIRED_END_UNCERTAINTY = 0;
 
+    @Argument(doc = "Maximal difference of the read start position that counted as equal. Useful for flow based " +
+            "reads where the end position might vary due to sequencing errors. " +
+            "(for this argument, \"read start\" means 5' end in the direction of sequencing)", optional = true)
+    public int UNPAIRED_START_UNCERTAINTY = 0;
+
     @Argument(doc = "Skip first N flows, starting from the read's start, when considering duplicates. Useful for flow based reads where sometimes there " +
             "is noise in the first flows " +
             "(for this argument, \"read start\" means 5' end).", optional = true)

diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java
@@ -30,6 +30,7 @@
 import htsjdk.samtools.SAMRecord;
 import htsjdk.samtools.util.Log;
 import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
+import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -59,6 +60,7 @@ public class MarkDuplicatesForFlowHelper implements MarkDuplicatesHelper {
     public static final char[]  CLIPPING_TAG_CONTAINS_AQ = {'A', 'Q'};
     public static final char[]  CLIPPING_TAG_CONTAINS_QZ = {'Q', 'Z'};
 
+    public static final int DIST_FROM_END = 10;
     // instance of hosting MarkDuplicates
     private final MarkDuplicates  md;
 
@@ -105,19 +107,24 @@ public void generateDuplicateIndexes(final boolean useBarcodes, final boolean in
         ReadEndsForMarkDuplicates firstOfNextChunk = null;
         int nextChunkRead1Coordinate2Min = Integer.MAX_VALUE;
         int nextChunkRead1Coordinate2Max = Integer.MIN_VALUE;
+        int nextChunkRead1Coordinate1Min = Integer.MAX_VALUE;
+        int nextChunkRead1Coordinate1Max = Integer.MIN_VALUE;
+
         final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<>(200);
         boolean containsPairs = false;
         boolean containsFrags = false;
 
         for (final ReadEndsForMarkDuplicates next : md.fragSort) {
             if (firstOfNextChunk != null && areComparableForDuplicatesWithEndSignificance(firstOfNextChunk, next, useBarcodes,
-                    nextChunkRead1Coordinate2Min, nextChunkRead1Coordinate2Max)) {
+                    nextChunkRead1Coordinate2Min, nextChunkRead1Coordinate2Max, nextChunkRead1Coordinate1Min, nextChunkRead1Coordinate1Max)) {
                 nextChunk.add(next);
                 containsPairs = containsPairs || next.isPaired();
                 containsFrags = containsFrags || !next.isPaired();
                 if ( next.read2Coordinate != END_INSIGNIFICANT_VALUE) {
                     nextChunkRead1Coordinate2Min = Math.min(nextChunkRead1Coordinate2Min, next.read2Coordinate);
                     nextChunkRead1Coordinate2Max = Math.max(nextChunkRead1Coordinate2Max, next.read2Coordinate);
+                    nextChunkRead1Coordinate1Min = Math.min(nextChunkRead1Coordinate1Min, next.read1Coordinate);
+                    nextChunkRead1Coordinate1Max = Math.max(nextChunkRead1Coordinate1Max, next.read1Coordinate);
 
                     if ( firstOfNextChunk.read2Coordinate == END_INSIGNIFICANT_VALUE)
                         firstOfNextChunk = next;
@@ -129,6 +136,7 @@ public void generateDuplicateIndexes(final boolean useBarcodes, final boolean in
                 nextChunk.clear();
                 nextChunk.add(next);
                 firstOfNextChunk = next;
+                nextChunkRead1Coordinate1Min = nextChunkRead1Coordinate1Max = next.read1Coordinate;
                 if ( next.read2Coordinate != END_INSIGNIFICANT_VALUE)
                     nextChunkRead1Coordinate2Min = nextChunkRead1Coordinate2Max = next.read2Coordinate;
                 else {
@@ -172,9 +180,7 @@ public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final
         }
 
         // adjust score
-        if ( md.flowBasedArguments.FLOW_QUALITY_SUM_STRATEGY ) {
-            ends.score = computeFlowDuplicateScore(rec, ends.read2Coordinate);
-        }
+        ends.score = computeFlowDuplicateScore(rec, ends.read2Coordinate);
 
         return ends;
     }
@@ -184,22 +190,53 @@ public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final
      */
     @Override
     public short getReadDuplicateScore(final SAMRecord rec, final ReadEndsForMarkDuplicates pairedEnds) {
-        if (md.flowBasedArguments.FLOW_QUALITY_SUM_STRATEGY ) {
+        if (md.flowBasedArguments.FLOW_MODE){
             return computeFlowDuplicateScore(rec, pairedEnds.read2Coordinate);
         } else {
             return md.getReadDuplicateScore(rec, pairedEnds);
         }
     }
 
+    //This method is identical to MarkDuplicates.areComparableForDuplicates but allows working with readStartUncertainty
+    protected boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs,
+                                                 final boolean compareRead2, final boolean useBarcodes) {
+        boolean areComparable = lhs.libraryId == rhs.libraryId;
+
+        if (useBarcodes && areComparable) { // areComparable is useful here to avoid the casts below
+            final ReadEndsForMarkDuplicatesWithBarcodes lhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) lhs;
+            final ReadEndsForMarkDuplicatesWithBarcodes rhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) rhs;
+            areComparable = lhsWithBarcodes.barcode == rhsWithBarcodes.barcode &&
+                    lhsWithBarcodes.readOneBarcode == rhsWithBarcodes.readOneBarcode &&
+                    lhsWithBarcodes.readTwoBarcode == rhsWithBarcodes.readTwoBarcode;
+        }
+
+        if (areComparable) {
+            areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
+                    lhs.orientation == rhs.orientation;
+        }
+
+        if (areComparable && compareRead2) {
+            areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
+                    lhs.read2Coordinate == rhs.read2Coordinate;
+        }
+
+        return areComparable;
+    }
+
     /**
      * This method is identical in function to areComparableForDuplicates except that it accomodates for
      * the possible significance of the end side of the reads (w/ or wo/ uncertainty). This is only
      * applicable for flow mode invocation.
      */
     private boolean areComparableForDuplicatesWithEndSignificance(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean useBarcodes,
-                                                                  final int lhsRead1Coordinate2Min, final int lhsRead1Coordinate2Max) {
-        boolean areComparable = md.areComparableForDuplicates(lhs, rhs, false, useBarcodes);
+                                                                  final int lhsRead1Coordinate2Min, final int lhsRead1Coordinate2Max,
+                                                                  final int lhsRead1Coordinate1Min, final int lhsRead1Coordinate1Max) {
+        boolean areComparable = areComparableForDuplicates(lhs, rhs, false, useBarcodes);
 
+        if (areComparable) {
+            areComparable = endCoorInRangeWithUncertainty(lhsRead1Coordinate1Min, lhsRead1Coordinate1Max,
+                    rhs.read1Coordinate, md.flowBasedArguments.UNPAIRED_START_UNCERTAINTY);
+        }
         if (areComparable) {
             areComparable = (!endCoorSignificant(lhs.read2Coordinate, rhs.read2Coordinate) ||
                     endCoorInRangeWithUncertainty(lhsRead1Coordinate2Min, lhsRead1Coordinate2Max,
@@ -236,8 +273,8 @@ static protected int getFlowSumOfBaseQualities(final SAMRecord rec, final int th
         final byte[]  bases = rec.getReadBases();
 
         // create iteration range and direction
-        final int startingOffset = !rec.getReadNegativeStrandFlag() ? 0 : bases.length;
-        final int endOffset = !rec.getReadNegativeStrandFlag() ? bases.length : 0;
+        final int startingOffset = !rec.getReadNegativeStrandFlag() ? 0 : bases.length - 1;
+        final int endOffset = !rec.getReadNegativeStrandFlag() ? bases.length : -1;
         final int  iterIncr = !rec.getReadNegativeStrandFlag() ? 1 : -1;
 
         // loop on bases, extract qual related to homopolymer from start of homopolymer
@@ -257,6 +294,55 @@ static protected int getFlowSumOfBaseQualities(final SAMRecord rec, final int th
         return score;
     }
 
+    /**
+     * A quality selection strategy used for flow based reads.
+     *
+     * We look at the bases of the reads that are close to the ends of the fragment
+     * and calculate the minimal quality of the homopolymers.
+     *
+     * @param rec - SAMRecord to get a score for
+     * @param dist - Distance from the end
+     * @return - calculated score (see method description)
+     */
+    static protected int getFlowSumOfBaseQualitiesNearEnds(final SAMRecord rec, int dist) {
+        int score = 100;
+
+        // access qualities and bases
+        final byte[] quals = rec.getBaseQualities();
+        final byte[]  bases = rec.getReadBases();
+
+        boolean insideHpol = false;
+        if (dist > bases.length){
+            dist = bases.length;
+        }
+
+        for ( int i = 0 ; (i < dist) || ( insideHpol ) ; i ++ ) {
+            final byte base = bases[i];
+            if ( (i == bases.length - 1) || ( base != bases[i+1] )) {
+                insideHpol = false;
+            } else {
+                insideHpol = true;
+            }
+
+            if ( quals[i] < score) {
+                score = quals[i];
+            }
+        }
+
+        for ( int i = bases.length-1 ; (i > bases.length - 1 - dist) || ( insideHpol ) ; i -- ) {
+            final byte base = bases[i];
+            if ( (i == 0) || ( base != bases[i - 1] )) {
+                insideHpol = false;
+            } else {
+                insideHpol = true;
+            }
+
+            if ( quals[i] < score) {
+                score = quals[i];
+            }
+        }
+        return score;
+    }
     private short computeFlowDuplicateScore(final SAMRecord rec, final int end) {
 
         if ( end == END_INSIGNIFICANT_VALUE)
@@ -265,8 +351,13 @@ private short computeFlowDuplicateScore(final SAMRecord rec, final int end) {
         Short storedScore = (Short)rec.getTransientAttribute(ATTR_DUPLICATE_SCORE);
         if ( storedScore == null ) {
             short score = 0;
-
-            score += (short) Math.min(getFlowSumOfBaseQualities(rec, md.flowBasedArguments.FLOW_EFFECTIVE_QUALITY_THRESHOLD), Short.MAX_VALUE / 2);
+            if (md.flowBasedArguments.FLOW_DUP_STRATEGY == MarkDuplicatesForFlowArgumentCollection.FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_QUALITY_SUM_STRATEGY) {
+                score += (short) Math.min(getFlowSumOfBaseQualities(rec, md.flowBasedArguments.FLOW_EFFECTIVE_QUALITY_THRESHOLD), Short.MAX_VALUE / 2);
+            } else if (md.flowBasedArguments.FLOW_DUP_STRATEGY == MarkDuplicatesForFlowArgumentCollection.FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_END_QUALITY_STRATEGY) {
+                score += (short) Math.min(getFlowSumOfBaseQualitiesNearEnds(rec, DIST_FROM_END), Short.MAX_VALUE / 2);
+            } else {
+                throw new IllegalArgumentException("Unknown flow duplicate selection strategy: " + md.flowBasedArguments.FLOW_DUP_STRATEGY);
+            }
 
             score += rec.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0;
             storedScore = score;

diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java
@@ -166,14 +166,16 @@ public Object[][] forFlowDataProvider() {
                                 new TestRecordInfo(76, 12,"76M", true, "AAAC", null),
                                 new TestRecordInfo(76, 12, "76M", false, "AACC", null)
                         },
-                        new String[] { "FLOW_QUALITY_SUM_STRATEGY=false" },
+                        new String[] { "FLOW_DUP_STRATEGY=FLOW_QUALITY_SUM_STRATEGY" },
                         new TesterModifier() {
                             @Override
                             public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) {
                                 final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]);
                                 records[0].setAttribute("tp", new int[76]);
                                 records[1].setAttribute("tp", new int[76]);
-                                records[0].getBaseQualities()[1] = 25; // dip inside AAA
+                                records[0].getBaseQualities()[0] = 25; // dip inside AAA
+                                records[0].getBaseQualities()[2] = 25; // dip inside AAA
+
                             }
                         }
                 },
@@ -185,14 +187,33 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester)
                                 new TestRecordInfo(76, 12,"76M", false, "AAAC", null),
                                 new TestRecordInfo(76, 12, "76M", true, "AACC", null)
                         },
-                        new String[] { "FLOW_QUALITY_SUM_STRATEGY=true" },
+                        new String[] { "FLOW_DUP_STRATEGY=FLOW_QUALITY_SUM_STRATEGY" },
+                        new TesterModifier() {
+                            @Override
+                            public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) {
+                                final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]);
+                                records[0].setAttribute("tp", new int[76]);
+                                records[1].setAttribute("tp", new int[76]);
+                                records[0].getBaseQualities()[1] = 25; // dip inside AAA
+                            }
+                        }
+                },
+                // testFLOW_END_QUALITY_STRATEGY: flow (homopolymer based) minumum
+                {
+                        DuplicateScoringStrategy.ScoringStrategy.SUM_OF_BASE_QUALITIES,
+                        new TestRecordInfo[] {
+                                new TestRecordInfo(76, 12,"76M", true, "AAAC", null),
+                                new TestRecordInfo(76, 12, "76M", false, "AACC", null)
+                        },
+                        new String[] { "FLOW_DUP_STRATEGY=FLOW_END_QUALITY_STRATEGY" },
                         new TesterModifier() {
                             @Override
                             public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) {
                                 final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]);
                                 records[0].setAttribute("tp", new int[76]);
                                 records[1].setAttribute("tp", new int[76]);
                                 records[0].getBaseQualities()[1] = 25; // dip inside AAA
+                                records[1].getBaseQualities()[30] = 10;
                             }
                         }
                 },
@@ -208,6 +229,17 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester)
                         new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_END_UNCERTAINTY=10" }, null
                 },
 
+                // testUNPAIRED_START_UNCERTAINTY: End location is significant and uncertain, end sorted
+                {
+                        null,
+                        new TestRecordInfo[] {
+                                new TestRecordInfo(74, 12, null, false, null, null),
+                                new TestRecordInfo(64, 22, null, true, null, null),
+                                new TestRecordInfo(54, 32, null, true, null, null)
+                        },
+                        new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_START_UNCERTAINTY=10" }, null
+                },
+
                 // testUNPAIRED_END_UNCERTAINTY: End location is significant and uncertain, end not sorted
                 {
                         null,
@@ -267,7 +299,7 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester)
     }
 
     @Test(dataProvider = "forFlowDataProvider")
-    public void testForFlow(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy, final TestRecordInfo[] recInfos, final String[] params, TesterModifier modifier) {
+    public void testForFlowMDCall(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy, final TestRecordInfo[] recInfos, final String[] params, TesterModifier modifier) {
 
         // get tester, build records
         final AbstractMarkDuplicatesCommandLineProgramTester tester =
@@ -339,4 +371,33 @@ public void testGetFlowSumOfBaseQualities(final String bases, final byte[] quals
         Assert.assertEquals(score, expectedScore);
     }
 
+    @DataProvider(name ="getFlowEndBaseQualitiesDataProvider")
+    public Object[][] getFlowEndBaseQualitiesDataProvider() {
+        return new Object[][] {
+                { "AAAA", new byte[] {50,50,50,50}, 2, 50 },
+                { "AAAA", new byte[] {50,10,10,50}, 4, 10 },
+                { "ACCA", new byte[] {20,10,10,20}, 1, 20 },
+                { "AABBCC", new byte[] {50,50,10,10,40,40}, 30, 10 },
+        };
+    }
+
+    @Test(dataProvider = "getFlowEndBaseQualitiesDataProvider")
+    public void testGetFlowEndBaseQualities(final String bases, final byte[] quals, final int threshold, final int expectedScore) {
+
+        // build record
+        final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
+        tester.getSamRecordSetBuilder().setReadLength(bases.length());
+        tester.addMappedFragment(0, 12, false, 50);
+
+        // install bases and quals
+        final SAMRecord rec = tester.getSamRecordSetBuilder().getRecords().iterator().next();
+        System.arraycopy(bases.getBytes(), 0, rec.getReadBases(), 0,bases.length());
+        System.arraycopy(quals, 0, rec.getBaseQualities(), 0, quals.length);
+
+        // calculate score
+        final int score = MarkDuplicatesForFlowHelper.getFlowSumOfBaseQualitiesNearEnds(rec, threshold);
+        Assert.assertEquals(score, expectedScore);
+    }
+
+
 }