Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix CollectHSMetrics - Don't use Coverage Cap on fold score #1913

Merged
46 changes: 23 additions & 23 deletions src/main/java/picard/analysis/directed/TargetMetricsCollector.java
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri

// histogram of base qualities. includes all but quality 2 bases. we use this histogram to calculate theoretical het sensitivity.
private final Histogram<Integer> unfilteredBaseQHistogram = new Histogram<>("baseq", "unfiltered_baseq_count");
private final Histogram<Integer> unCappedUnfilteredBaseQHistogram = new Histogram<>("baseq", "unfiltered_baseq_count");

private static final double LOG_ODDS_THRESHOLD = 3.0;

Expand Down Expand Up @@ -367,13 +368,17 @@ public class PerUnitTargetMetricCollector implements PerUnitMetricCollector<METR
private File perBaseOutput;

final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
final long[] unCappedBaseQHistogramArray = new long[Byte.MAX_VALUE];

// A Map to accumulate per-bait-region (i.e. merge of overlapping targets) coverage
// excludes bases with qualities lower than minimumBaseQuality (default 20)
private final Map<Interval, Coverage> highQualityCoverageByTarget;

// only excludes bases with quality 2. collected for theoretical set sensitivity
private final Map<Interval, Coverage> unfilteredCoverageByTarget;

private long hqMaxDepth = 0;

private final TargetMetrics metrics = new TargetMetrics();
private final int minimumBaseQuality;
private final CountingAdapterFilter adapterFilter;
Expand Down Expand Up @@ -631,11 +636,13 @@ public void acceptRecord(final SAMRecord record) {
if (ufCoverage.getDepths()[targetOffset] <= coverageCap) {
baseQHistogramArray[qual]++;
}
unCappedBaseQHistogramArray[qual]++;

// Then filtered
if (highQual) {
final Coverage hqCoverage = highQualityCoverageByTarget.get(target);
hqCoverage.addBase(targetOffset);
hqMaxDepth = Math.max(hqMaxDepth, hqCoverage.getDepths()[targetOffset]);

if (coveredTargets.add(target)) {
hqCoverage.incrementReadCount();
Expand Down Expand Up @@ -699,16 +706,14 @@ public void finish() {

/** Calculates how much additional sequencing is needed to raise 80% of bases to the mean for the lane. */
private void calculateTargetCoverageMetrics() {
final long[] highQualityCoverageHistogramArray = new long[coverageCap+1];

LongStream.range(0, hqMaxDepth).forEach(i -> highQualityDepthHistogram.increment((int) i, 0));

int zeroCoverageTargets = 0;

// the number of bases we counted towards the depth histogram plus those that got thrown out by the coverage cap
long totalCoverage = 0;

// the maximum depth at any target base
long maxDepth = 0;

// the minimum depth at any target base
long minDepth = Long.MAX_VALUE;

// The "how many target bases at at-least X" calculations.
Expand All @@ -723,16 +728,15 @@ private void calculateTargetCoverageMetrics() {
for (final Coverage c : this.highQualityCoverageByTarget.values()) {
if (!c.hasCoverage()) {
zeroCoverageTargets++;
highQualityCoverageHistogramArray[0] += c.interval.length();
highQualityDepthHistogram.increment(0, c.interval.length());
targetBases[0] += c.interval.length();
minDepth = 0;
continue;
}

for (final int depth : c.getDepths()) {
totalCoverage += depth;
highQualityCoverageHistogramArray[Math.min(depth, coverageCap)]++;
maxDepth = Math.max(maxDepth, depth);
highQualityDepthHistogram.increment(depth, 1);
minDepth = Math.min(minDepth, depth);

// Add to the "how many target bases at at-least X" calculations.
Expand All @@ -747,16 +751,11 @@ private void calculateTargetCoverageMetrics() {
throw new PicardException("the number of target bases with at least 0x coverage does not equal the number of target bases");
}

for (int i = 0; i < highQualityCoverageHistogramArray.length; ++i) {
highQualityDepthHistogram.increment(i, highQualityCoverageHistogramArray[i]);
}

// we do this instead of highQualityDepthHistogram.getMean() because the histogram imposes a coverage cap
metrics.MEAN_TARGET_COVERAGE = (double) totalCoverage / metrics.TARGET_TERRITORY;
metrics.MEDIAN_TARGET_COVERAGE = highQualityDepthHistogram.getMedian();
metrics.MAX_TARGET_COVERAGE = maxDepth;
metrics.MAX_TARGET_COVERAGE = hqMaxDepth;
// Use Math.min() to account for edge case where highQualityCoverageByTarget is empty (minDepth=Long.MAX_VALUE)
metrics.MIN_TARGET_COVERAGE = Math.min(minDepth, maxDepth);
metrics.MIN_TARGET_COVERAGE = Math.min(minDepth, hqMaxDepth);

// compute the coverage value such that 80% of target bases have better coverage than it i.e. 20th percentile
// this roughly measures how much we must sequence extra such that 80% of target bases have coverage at least as deep as the current mean coverage
Expand All @@ -783,32 +782,33 @@ private void calculateTargetCoverageMetrics() {
metrics.PCT_TARGET_BASES_100000X = (double) targetBases[17] / (double) targetBases[0];
}


private void calculateTheoreticalHetSensitivity(){
final long[] unfilteredDepthHistogramArray = new long[coverageCap + 1];
LongStream.range(0, coverageCap).forEach(i -> unfilteredDepthHistogram.increment((int) i, 0));

// collect the unfiltered coverages (i.e. only quality 2 bases excluded) for all targets into a histogram array
for (final Coverage c : this.unfilteredCoverageByTarget.values()) {
if (!c.hasCoverage()) {
unfilteredDepthHistogramArray[0] += c.interval.length();
unfilteredDepthHistogram.increment(0, c.interval.length());
continue;
}

for (final int depth : c.getDepths()) {
unfilteredDepthHistogramArray[Math.min(depth, coverageCap)]++;
unfilteredDepthHistogram.increment(Math.min(depth, coverageCap), 1);
}
}

if (LongStream.of(baseQHistogramArray).sum() != LongStream.rangeClosed(0, coverageCap).map(i -> i * unfilteredDepthHistogramArray[(int)i]).sum()) {
if (LongStream.of(baseQHistogramArray).sum() != unfilteredDepthHistogram.getSum()) {
throw new PicardException("numbers of bases in the base quality histogram and the coverage histogram are not equal");
}

// TODO: normalize the arrays directly. then we don't have to convert to Histograms
//TODO this is used elsewhere, so can't remove from here just yet - consider moving to accessor.
for (int i=0; i<baseQHistogramArray.length; ++i) {
unfilteredBaseQHistogram.increment(i, baseQHistogramArray[i]);
}

for (int i = 0; i < unfilteredDepthHistogramArray.length; i++){
unfilteredDepthHistogram.increment(i, unfilteredDepthHistogramArray[i]);
for (int i=0; i<unCappedBaseQHistogramArray.length; ++i) {
unCappedUnfilteredBaseQHistogram.increment(i, unCappedBaseQHistogramArray[i]);
}

final double [] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(unfilteredDepthHistogram);
Expand Down Expand Up @@ -942,7 +942,7 @@ private void calculateGcMetrics() {
/**
 * Writes the accumulated per-unit results to the given metrics file: the converted
 * metrics record, the high-quality depth histogram, and the base-quality histogram.
 *
 * @param hsMetricsComparableMetricsFile destination metrics file; receives one metric
 *                                       record plus the supporting histograms.
 */
public void addMetricsToFile(final MetricsFile<METRIC_TYPE, Integer> hsMetricsComparableMetricsFile) {
    hsMetricsComparableMetricsFile.addMetric(convertMetric(this.metrics));
    hsMetricsComparableMetricsFile.addHistogram(highQualityDepthHistogram);
    // Emit the UNCAPPED unfiltered base-quality histogram so downstream consumers are not
    // distorted by the coverage cap (the capped unfilteredBaseQHistogram is retained only
    // for the theoretical-sensitivity computation).
    hsMetricsComparableMetricsFile.addHistogram(unCappedUnfilteredBaseQHistogram);
}
}

Expand Down
35 changes: 35 additions & 0 deletions src/test/java/picard/analysis/directed/CollectHsMetricsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,41 @@ public void testHsMetricsHandlesIndelsAppropriately() throws IOException {
Assert.assertEquals(insWithIndelHandling.PCT_USABLE_BASES_ON_TARGET, 1.0d); // inserted bases are counted as on target
}

@Test
public void testHsMetricsF80DoesNotUseCovCap() throws IOException {
final SAMRecordSetBuilder highCoverage = new SAMRecordSetBuilder(true, SortOrder.coordinate);
JoeVieira marked this conversation as resolved.
Show resolved Hide resolved
final IntervalList targets = new IntervalList(highCoverage.getHeader());
final IntervalList baits = new IntervalList(highCoverage.getHeader());
targets.add(new Interval("chr1", 1000, 1199, false, "t1"));
baits.add(new Interval("chr1", 950, 1049, false, "b1"));
baits.add(new Interval("chr1", 1050, 1149, false, "b2"));
baits.add(new Interval("chr1", 1150, 1249, false, "b3"));

// Generate 100000 reads that fully cover the target in each BAM
for (int i=0; i<100000; ++i) {
highCoverage.addFrag( "r" + i, 0, 1000, false, false, "100M100M", null, 30);
}

// Write things out to file
final File dir = IOUtil.createTempDir("hsmetrics.test").toFile();
final File bs = new File(dir, "baits.interval_list").getAbsoluteFile();
final File ts = new File(dir, "targets.interval_list").getAbsoluteFile();
baits.write(bs);
targets.write(ts);
final File withHighCovBam = writeBam(highCoverage, new File(dir, "fold_80.bam"));

// Now run CollectHsMetrics
final File out = Files.createTempFile("hsmetrics_high_coverage.", ".txt").toFile();
runPicardCommandLine(Arrays.asList("COVERAGE_CAP=10", "SAMPLE_SIZE=0", "TI="+ts.getPath(), "BI="+bs.getPath(), "O="+out.getPath(), "I="+withHighCovBam.getAbsolutePath()));
final HsMetrics highCoverageMetrics = readMetrics(out);

IOUtil.deleteDirectoryTree(dir);
// actual coverage should not be impacted by coverage_cap
Assert.assertEquals(highCoverageMetrics.MEAN_TARGET_COVERAGE, 100000);
Assert.assertEquals(highCoverageMetrics.MEDIAN_TARGET_COVERAGE, 100000);
Assert.assertEquals(highCoverageMetrics.FOLD_80_BASE_PENALTY, 1);
}


@Test
public void testHsMetricsHighTargetCoverage() throws IOException {
Expand Down
Loading