Fix CollectHSMetrics - Don't use Coverage Cap on fold score (#1913)
* No coverage cap on Histogram ArrayList used for statistics on actual coverage metrics

* Use hashmap for sparse array structure

* Test coverage to ensure Fold80 & median coverage are not impacted by Coverage Cap - per documentation

* Using histogram directly

* Normalize Depth Array directly & test

* Create unfiltered, capped array

* Create unfiltered, uncapped histogram for output

* Close TODO of normalizing array directly, to avoid the extra flip between array & histogram

* Remove tests for normalizeDepthArray

* Remove normalizeDepthArray

* Build uncapped data for outputting histograms, keep capped data for theoretical stats, move calculation of min / max depth into base loading, ensure histograms are not sparsely populated

* Remove longstream import

* Clean up extra whitespace

* Remove uncappedunfiltered, it's not needed

* Clean up unneeded properties
JoeVieira authored Feb 15, 2024
1 parent c026f94 commit 6d33c94
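
Why the cap distorted the fold score: FOLD_80_BASE_PENALTY divides mean target coverage by the 20th-percentile depth, and MEDIAN_TARGET_COVERAGE is read off the same depth histogram, so clamping every depth at COVERAGE_CAP before the histogram was built dragged those percentiles down while the mean, computed from raw totals, stayed correct. Below is a minimal, self-contained sketch of that arithmetic (plain arrays and an assumed cap of 10, not Picard's Histogram class):

import java.util.Arrays;

// Standalone illustration, not Picard code: fold-80 from capped vs. uncapped depths.
public class Fold80CapSketch {

    // Depth d such that `fraction` of the bases have depth <= d (fraction = 0.2 gives the 20th percentile).
    static long percentile(final long[] depths, final double fraction) {
        final long[] sorted = depths.clone();
        Arrays.sort(sorted);
        return sorted[(int) Math.min(sorted.length - 1L, (long) Math.floor(fraction * sorted.length))];
    }

    public static void main(String[] args) {
        final long[] depths = new long[200];   // one entry per base of a 200bp target
        Arrays.fill(depths, 100_000L);         // uniform 100000x coverage, as in the new test
        final double mean = Arrays.stream(depths).average().orElse(0); // mean from raw totals: 100000.0

        // Uncapped depth histogram (the fix): fold-80 = 100000 / 100000 = 1.0
        System.out.println(mean / percentile(depths, 0.2));

        // Depths clamped at an assumed COVERAGE_CAP of 10 first (the old behavior): fold-80 = 100000 / 10 = 10000.0
        final long[] capped = Arrays.stream(depths).map(d -> Math.min(d, 10L)).toArray();
        System.out.println(mean / percentile(capped, 0.2));
    }
}

With uniform 100000x coverage this prints 1.0 and then 10000.0; the second value is the inflated penalty the new test guards against.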
Showing 2 changed files with 58 additions and 23 deletions.
46 changes: 23 additions & 23 deletions src/main/java/picard/analysis/directed/TargetMetricsCollector.java
@@ -131,6 +131,7 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri
 
     // histogram of base qualities. includes all but quality 2 bases. we use this histogram to calculate theoretical het sensitivity.
     private final Histogram<Integer> unfilteredBaseQHistogram = new Histogram<>("baseq", "unfiltered_baseq_count");
+    private final Histogram<Integer> unCappedUnfilteredBaseQHistogram = new Histogram<>("baseq", "unfiltered_baseq_count");
 
     private static final double LOG_ODDS_THRESHOLD = 3.0;
 
@@ -367,13 +368,17 @@ public class PerUnitTargetMetricCollector implements PerUnitMetricCollector<METR
         private File perBaseOutput;
 
         final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
+        final long[] unCappedBaseQHistogramArray = new long[Byte.MAX_VALUE];
 
         // A Map to accumulate per-bait-region (i.e. merge of overlapping targets) coverage
         // excludes bases with qualities lower than minimumBaseQuality (default 20)
         private final Map<Interval, Coverage> highQualityCoverageByTarget;
 
         // only excludes bases with quality 2. collected for theoretical het sensitivity
         private final Map<Interval, Coverage> unfilteredCoverageByTarget;
 
+        private long hqMaxDepth = 0;
+
         private final TargetMetrics metrics = new TargetMetrics();
         private final int minimumBaseQuality;
         private final CountingAdapterFilter adapterFilter;
@@ -631,11 +636,13 @@ public void acceptRecord(final SAMRecord record) {
                     if (ufCoverage.getDepths()[targetOffset] <= coverageCap) {
                         baseQHistogramArray[qual]++;
                     }
+                    unCappedBaseQHistogramArray[qual]++;
 
                     // Then filtered
                     if (highQual) {
                         final Coverage hqCoverage = highQualityCoverageByTarget.get(target);
                         hqCoverage.addBase(targetOffset);
+                        hqMaxDepth = Math.max(hqMaxDepth, hqCoverage.getDepths()[targetOffset]);
 
                         if (coveredTargets.add(target)) {
                             hqCoverage.incrementReadCount();
Expand Down Expand Up @@ -699,16 +706,14 @@ public void finish() {

/** Calculates how much additional sequencing is needed to raise 80% of bases to the mean for the lane. */
private void calculateTargetCoverageMetrics() {
final long[] highQualityCoverageHistogramArray = new long[coverageCap+1];

LongStream.range(0, hqMaxDepth).forEach(i -> highQualityDepthHistogram.increment((int) i, 0));

int zeroCoverageTargets = 0;

// the number of bases we counted towards the depth histogram plus those that got thrown out by the coverage cap
long totalCoverage = 0;

// the maximum depth at any target base
long maxDepth = 0;

// the minimum depth at any target base
long minDepth = Long.MAX_VALUE;

// The "how many target bases at at-least X" calculations.
@@ -723,16 +728,15 @@ private void calculateTargetCoverageMetrics() {
             for (final Coverage c : this.highQualityCoverageByTarget.values()) {
                 if (!c.hasCoverage()) {
                     zeroCoverageTargets++;
-                    highQualityCoverageHistogramArray[0] += c.interval.length();
+                    highQualityDepthHistogram.increment(0, c.interval.length());
                     targetBases[0] += c.interval.length();
                     minDepth = 0;
                     continue;
                 }
 
                 for (final int depth : c.getDepths()) {
                     totalCoverage += depth;
-                    highQualityCoverageHistogramArray[Math.min(depth, coverageCap)]++;
-                    maxDepth = Math.max(maxDepth, depth);
+                    highQualityDepthHistogram.increment(depth, 1);
                     minDepth = Math.min(minDepth, depth);
 
                     // Add to the "how many target bases at at-least X" calculations.
@@ -747,16 +751,11 @@ private void calculateTargetCoverageMetrics() {
                 throw new PicardException("the number of target bases with at least 0x coverage does not equal the number of target bases");
             }
 
-            for (int i = 0; i < highQualityCoverageHistogramArray.length; ++i) {
-                highQualityDepthHistogram.increment(i, highQualityCoverageHistogramArray[i]);
-            }
-
-            // we do this instead of highQualityDepthHistogram.getMean() because the histogram imposes a coverage cap
             metrics.MEAN_TARGET_COVERAGE = (double) totalCoverage / metrics.TARGET_TERRITORY;
             metrics.MEDIAN_TARGET_COVERAGE = highQualityDepthHistogram.getMedian();
-            metrics.MAX_TARGET_COVERAGE = maxDepth;
+            metrics.MAX_TARGET_COVERAGE = hqMaxDepth;
             // Use Math.min() to account for edge case where highQualityCoverageByTarget is empty (minDepth=Long.MAX_VALUE)
-            metrics.MIN_TARGET_COVERAGE = Math.min(minDepth, maxDepth);
+            metrics.MIN_TARGET_COVERAGE = Math.min(minDepth, hqMaxDepth);
 
             // compute the coverage value such that 80% of target bases have better coverage than it i.e. 20th percentile
             // this roughly measures how much we must sequence extra such that 80% of target bases have coverage at least as deep as the current mean coverage
@@ -783,32 +782,33 @@ private void calculateTargetCoverageMetrics() {
             metrics.PCT_TARGET_BASES_100000X = (double) targetBases[17] / (double) targetBases[0];
         }
 
+
         private void calculateTheoreticalHetSensitivity(){
-            final long[] unfilteredDepthHistogramArray = new long[coverageCap + 1];
+            LongStream.range(0, coverageCap).forEach(i -> unfilteredDepthHistogram.increment((int) i, 0));
 
             // collect the unfiltered coverages (i.e. only quality 2 bases excluded) for all targets into a histogram array
             for (final Coverage c : this.unfilteredCoverageByTarget.values()) {
                 if (!c.hasCoverage()) {
-                    unfilteredDepthHistogramArray[0] += c.interval.length();
+                    unfilteredDepthHistogram.increment(0, c.interval.length());
                     continue;
                 }
 
                 for (final int depth : c.getDepths()) {
-                    unfilteredDepthHistogramArray[Math.min(depth, coverageCap)]++;
+                    unfilteredDepthHistogram.increment(Math.min(depth, coverageCap), 1);
                 }
             }
 
-            if (LongStream.of(baseQHistogramArray).sum() != LongStream.rangeClosed(0, coverageCap).map(i -> i * unfilteredDepthHistogramArray[(int)i]).sum()) {
+            if (LongStream.of(baseQHistogramArray).sum() != unfilteredDepthHistogram.getSum()) {
                 throw new PicardException("numbers of bases in the base quality histogram and the coverage histogram are not equal");
            }
 
-            // TODO: normalize the arrays directly. then we don't have to convert to Histograms
+            //TODO this is used elsewhere, so can't remove from here just yet - consider moving to accessor.
             for (int i=0; i<baseQHistogramArray.length; ++i) {
                 unfilteredBaseQHistogram.increment(i, baseQHistogramArray[i]);
             }
 
-            for (int i = 0; i < unfilteredDepthHistogramArray.length; i++){
-                unfilteredDepthHistogram.increment(i, unfilteredDepthHistogramArray[i]);
+            for (int i=0; i<unCappedBaseQHistogramArray.length; ++i) {
+                unCappedUnfilteredBaseQHistogram.increment(i, unCappedBaseQHistogramArray[i]);
             }
 
             final double [] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(unfilteredDepthHistogram);
@@ -942,7 +942,7 @@ private void calculateGcMetrics() {
         public void addMetricsToFile(final MetricsFile<METRIC_TYPE, Integer> hsMetricsComparableMetricsFile) {
             hsMetricsComparableMetricsFile.addMetric(convertMetric(this.metrics));
             hsMetricsComparableMetricsFile.addHistogram(highQualityDepthHistogram);
-            hsMetricsComparableMetricsFile.addHistogram(unfilteredBaseQHistogram);
+            hsMetricsComparableMetricsFile.addHistogram(unCappedUnfilteredBaseQHistogram);
         }
     }
 
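Reduced to its essentials, the change in this file keeps two tallies per target base: a capped tally that still feeds the theoretical-sensitivity model, and an uncapped tally that feeds the reported histogram and metrics, with max depth tracked while bases load instead of in a later pass. A rough sketch of that bookkeeping, under invented names (DepthTracks, addBase) rather than Picard's actual fields:

import java.util.HashMap;
import java.util.Map;

// Illustrative bookkeeping only -- class and field names are invented, not Picard's.
class DepthTracks {
    static final int COVERAGE_CAP = 200;

    // Bounded by the cap, so a dense array is safe; feeds theoretical het sensitivity.
    final long[] cappedDepthCounts = new long[COVERAGE_CAP + 1];

    // Unbounded, so a map keeps the structure sparse; feeds the reported metrics.
    final Map<Long, Long> uncappedDepthCounts = new HashMap<>();

    // Tracked as bases load, replacing a separate pass over all depths.
    long maxDepth = 0;

    void addBase(final long depth) {
        cappedDepthCounts[(int) Math.min(depth, COVERAGE_CAP)]++;
        uncappedDepthCounts.merge(depth, 1L, Long::sum);
        maxDepth = Math.max(maxDepth, depth);
    }
}

The dense array is safe only because the cap bounds it; the uncapped side uses a map so a few extreme depths do not allocate millions of empty bins, the same concern behind the commit's "hashmap for sparse array structure" note.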
35 changes: 35 additions & 0 deletions src/test/java/picard/analysis/directed/CollectHsMetricsTest.java
@@ -217,6 +217,41 @@ public void testHsMetricsHandlesIndelsAppropriately() throws IOException {
         Assert.assertEquals(insWithIndelHandling.PCT_USABLE_BASES_ON_TARGET, 1.0d); // inserted bases are counted as on target
     }
 
+    @Test
+    public void testHsMetricsF80DoesNotUseCovCap() throws IOException {
+        final SAMRecordSetBuilder highCoverage = new SAMRecordSetBuilder(true, SortOrder.coordinate);
+        final IntervalList targets = new IntervalList(highCoverage.getHeader());
+        final IntervalList baits = new IntervalList(highCoverage.getHeader());
+        targets.add(new Interval("chr1", 1000, 1199, false, "t1"));
+        baits.add(new Interval("chr1", 950, 1049, false, "b1"));
+        baits.add(new Interval("chr1", 1050, 1149, false, "b2"));
+        baits.add(new Interval("chr1", 1150, 1249, false, "b3"));
+
+        // Generate 100000 reads that fully cover the target in each BAM
+        for (int i=0; i<100000; ++i) {
+            highCoverage.addFrag( "r" + i, 0, 1000, false, false, "100M100M", null, 30);
+        }
+
+        // Write things out to file
+        final File dir = IOUtil.createTempDir("hsmetrics.test").toFile();
+        final File bs = new File(dir, "baits.interval_list").getAbsoluteFile();
+        final File ts = new File(dir, "targets.interval_list").getAbsoluteFile();
+        baits.write(bs);
+        targets.write(ts);
+        final File withHighCovBam = writeBam(highCoverage, new File(dir, "fold_80.bam"));
+
+        // Now run CollectHsMetrics
+        final File out = Files.createTempFile("hsmetrics_high_coverage.", ".txt").toFile();
+        runPicardCommandLine(Arrays.asList("COVERAGE_CAP=10", "SAMPLE_SIZE=0", "TI="+ts.getPath(), "BI="+bs.getPath(), "O="+out.getPath(), "I="+withHighCovBam.getAbsolutePath()));
+        final HsMetrics highCoverageMetrics = readMetrics(out);
+
+        IOUtil.deleteDirectoryTree(dir);
+        // actual coverage should not be impacted by coverage_cap
+        Assert.assertEquals(highCoverageMetrics.MEAN_TARGET_COVERAGE, 100000);
+        Assert.assertEquals(highCoverageMetrics.MEDIAN_TARGET_COVERAGE, 100000);
+        Assert.assertEquals(highCoverageMetrics.FOLD_80_BASE_PENALTY, 1);
+    }
+
+
     @Test
     public void testHsMetricsHighTargetCoverage() throws IOException {
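The asserted values follow directly from the setup: 100000 fragments, each spanning all 200 target bases, give every base a depth of 100000, so mean and median are both 100000 and the 20th-percentile depth equals the mean, making FOLD_80_BASE_PENALTY exactly 1 even with COVERAGE_CAP=10. Outside the test harness, a roughly equivalent invocation (paths are placeholders; TI and BI above are the short forms of TARGET_INTERVALS and BAIT_INTERVALS) would be:

java -jar picard.jar CollectHsMetrics \
    I=fold_80.bam \
    O=hsmetrics_high_coverage.txt \
    TARGET_INTERVALS=targets.interval_list \
    BAIT_INTERVALS=baits.interval_list \
    COVERAGE_CAP=10 \
    SAMPLE_SIZE=0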
