Skip to content

Commit

Permalink
Fix EdgeReadIterator (#1616)
Browse files Browse the repository at this point in the history
* Fixing bug in EdgeReadIterator

- Adding tests to catch previously faulty behavior
- Also fixing other bugs regarding sorting of IntervalLists in AbstractLocusIterator and IntervalListReferenceSequenceMask
  • Loading branch information
michaelgatzen authored Sep 21, 2022
1 parent d15a5ba commit 347c0ac
Show file tree
Hide file tree
Showing 4 changed files with 264 additions and 71 deletions.
25 changes: 12 additions & 13 deletions src/main/java/htsjdk/samtools/util/AbstractLocusIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,6 @@ public abstract class AbstractLocusIterator<T extends AbstractRecordAndOffset, K

private final LocusComparator<Locus> locusComparator = new LocusComparator<>();

/**
* Last processed interval, relevant only if list of intervals is defined.
*/
private int lastInterval = 0;



public SAMFileHeader getHeader() {
Expand All @@ -159,7 +154,7 @@ public SAMFileHeader getHeader() {
*
* @param samReader must be coordinate sorted
* @param intervalList Either the list of desired intervals, or null. Note that if an intervalList is
* passed in that is not coordinate sorted, it will eventually be coordinated sorted by this class.
* passed in that is not coordinate sorted, it will eventually be coordinated sorted here.
* @param useIndex If true, do indexed lookup to improve performance. Not relevant if intervalList == null.
* It is no longer the case the useIndex==true can make performance worse. It should always perform at least
* as well as useIndex==false, and generally will be much faster.
Expand All @@ -176,8 +171,14 @@ public AbstractLocusIterator(final SamReader samReader, final IntervalList inter
this.samReader = samReader;
this.useIndex = useIndex;
if (intervalList != null) {
intervals = intervalList.uniqued().getIntervals();
this.referenceSequenceMask = new IntervalListReferenceSequenceMask(intervalList);
if (!intervalList.getHeader().getSequenceDictionary().isSameDictionary(getHeader().getSequenceDictionary())) {
throw new SAMException("The sequence dictionary of the interval list file differs from the sequence dictionary of the input SAM file.");
}

final boolean intervalListIsSorted = intervalList.getHeader().getSortOrder() == SAMFileHeader.SortOrder.coordinate;
final IntervalList sortedIntervalList = intervalListIsSorted ? intervalList : intervalList.sorted();
intervals = sortedIntervalList.uniqued().getIntervals();
this.referenceSequenceMask = new IntervalListReferenceSequenceMask(sortedIntervalList);
} else {
intervals = null;
this.referenceSequenceMask = new WholeGenomeReferenceSequenceMask(samReader.getFileHeader());
Expand Down Expand Up @@ -264,6 +265,9 @@ private boolean hasRemainingMaskBases() {
*/
@Override
public K next() {
if (this.samIterator == null) {
iterator();
}
// if we don't have any completed entries to return, try and make some!
while (complete.isEmpty() && samHasMore()) {
final SAMRecord rec = samIterator.peek();
Expand Down Expand Up @@ -576,11 +580,6 @@ protected List<Interval> getIntervals() {
return intervals;
}

protected Interval getCurrentInterval() {
if (intervals == null) return null;
return intervals.get(lastInterval);
}

public boolean isIncludeIndels() {
return includeIndels;
}
Expand Down
176 changes: 127 additions & 49 deletions src/main/java/htsjdk/samtools/util/EdgeReadIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@

package htsjdk.samtools.util;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.*;

/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis.
Expand All @@ -42,6 +39,12 @@
*
*/
public class EdgeReadIterator extends AbstractLocusIterator<EdgingRecordAndOffset, AbstractLocusInfo<EdgingRecordAndOffset>> {
// These variables are required to perform the detection of overlap between reads and intervals
private Interval currentInterval = null;
private final PeekableIterator<Interval> intervalListIterator;
private final IntervalCoordinateComparator intervalCoordinateComparator;

private final OverlapDetector<Interval> overlapDetector;

/**
* Prepare to iterate through the given SAM records, skipping non-primary alignments. Do not use
Expand All @@ -58,7 +61,7 @@ public EdgeReadIterator(final SamReader samReader) {
*
* @param samReader must be coordinate sorted
* @param intervalList Either the list of desired intervals, or null. Note that if an intervalList is
* passed in that is not coordinate sorted, it will eventually be coordinated sorted by this class.
* passed in that is not coordinate sorted, it will be coordinated sorted by this class.
*/
public EdgeReadIterator(final SamReader samReader, final IntervalList intervalList) {
this(samReader, intervalList, samReader.hasIndex());
Expand All @@ -69,13 +72,52 @@ public EdgeReadIterator(final SamReader samReader, final IntervalList intervalLi
*
* @param samReader must be coordinate sorted
* @param intervalList Either the list of desired intervals, or null. Note that if an intervalList is
* passed in that is not coordinate sorted, it will eventually be coordinated sorted by this class.
* passed in that is not coordinate sorted, it will be coordinated sorted here.
* @param useIndex If true, do indexed lookup to improve performance. Not relevant if intervalList == null.
* It is no longer the case the useIndex==true can make performance worse. It should always perform at least
* as well as useIndex==false, and generally will be much faster.
*/
public EdgeReadIterator(final SamReader samReader, final IntervalList intervalList, final boolean useIndex) {
super(samReader, intervalList, useIndex);

if (getIntervals() == null) {
intervalListIterator = null;
intervalCoordinateComparator = null;
overlapDetector = null;
} else {
// For the easy case of a read being fully contained in an interval, we use an iterator to keep track
// of the current interval.
intervalListIterator = new PeekableIterator<>(getIntervals().iterator());
if (intervalListIterator.hasNext()) {
currentInterval = intervalListIterator.next();
}
// We also need this comparator to keep track of the different contigs. The constructor of
// AbstractLocusInfo ensures that the sequence dictionaries of the SAM file and the interval list match.
intervalCoordinateComparator = new IntervalCoordinateComparator(getHeader());

// For more complicated cases, we need an OverlapDetector.
overlapDetector = OverlapDetector.create(getIntervals());
}
}

/**
* This function updates currentInterval according to the position of the record that it is
* presented and determines if the current read is fully contained in the currentInterval.
* @param rec The record we want to consider
* @return True, if rec is fully contained in the current interval, otherwise false
*/
protected boolean advanceCurrentIntervalAndCheckIfIntervalContainsRead(final SAMRecord rec) {
// currentInterval should never be null when calling this method, but we have to check it just to make sure,
// so that we don't get a NullPointerException in the return statement.
if (currentInterval == null) {
return false;
}
// Here we need to update the currentInterval. We have to do this using an
// IntervalCoordinateComparator to take factor in the order in the sequence dictionary.
while (intervalListIterator.peek() != null && intervalCoordinateComparator.compare(new Interval(rec), intervalListIterator.peek()) > 0) {
currentInterval = intervalListIterator.next();
}
return currentInterval.contains(rec);
}

/**
Expand All @@ -88,73 +130,109 @@ public EdgeReadIterator(final SamReader samReader, final IntervalList intervalLi
*/
@Override
void accumulateSamRecord(SAMRecord rec) {
// In the case that no intervals are passed, or that the current interval completely contains
// the current read (which is the most common case for WGS), set needToConsiderIntervals to false, so we don't
// have to find intersections and can later emit the read right away.
final boolean needToConsiderIntervals = intervals != null && !advanceCurrentIntervalAndCheckIfIntervalContainsRead(rec);

// interpret the CIGAR string and add the base info
for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) {
// General concept
//
// Example: Read (or more accurately, AlignmentBlock) from position 101 to 108
//
// accumulator (ArrayList<LocusInfo>) 30 31 32 33 34 35 36 37 38 (has one LocusInfo at each position, corresponding to the genomic position)
// LocusInfo objects (with genomic pos.) 100 101 102 103 104 105 106 107 108 (the LocusInfo objects can contain EdgingRecordAndOffset objects, if a record starts or ends there)
// ^ ^
// | |
// EdgingRecordAndOffset objects B E

// Steps:
// 1. Fill accumulator with enough LocusInfos to the end of the read
// 2. Determine which parts of the read are covered by the specified intervals
// (this step will be skipped if the read is completely contained within the
// interval, i.e. needToConsiderIntervals is false, otherwise each overlap
// will be considered individually)
// 2. Create a EdgingRecordAndOffset object with the type BEGIN and add it
// to the LocusInfo at the position in the accumulator corresponding to
// the start position of the read (or overlap)
// 3. Create a EdgingRecordAndOffset object with the type END and add it
// to the LocusInfo at the position in the accumulator corresponding to
// the end position of the read (or overlap)

// 0-based offset into the read of the current base
final int readOffset = alignmentBlock.getReadStart() - 1;
final int offsetStartOfAlignmentBlockInRead = alignmentBlock.getReadStart() - 1;
// 1-based reference position that the current base aligns to
final int refPos = alignmentBlock.getReferenceStart();
final int referencePositionStartOfAlignmentBlock = alignmentBlock.getReferenceStart();

// Here we add the first entry to the accumulator, which is the start of this AlignmentBlock.
if (accumulator.isEmpty()) {
accumulator.add(createLocusInfo(getReferenceSequence(rec.getReferenceIndex()), rec.getAlignmentStart()));
}

// The accumulator should always have LocusInfos that correspond to one consecutive segment of loci from one reference
// sequence. So
// The accumulator should always have LocusInfos that correspond to one consecutive segment of loci from
// one reference sequence. So
// accumulator.get(0).getPosition() + accumulator.size() == accumulator.get(accumulator.size()-1).getPosition()+1
final int accumulatorNextPosition = accumulator.get(0).getPosition() + accumulator.size();

if (accumulatorNextPosition != accumulator.get(accumulator.size() - 1).getPosition() + 1) {
throw new IllegalStateException("The accumulator has gotten into a funk. Cannot continue");
}

// Ensure there are AbstractLocusInfos up to and including this position
for (int locusPos = accumulatorNextPosition; locusPos <= refPos + alignmentBlock.getLength(); ++locusPos) {
// Ensure there are consecutive AbstractLocusInfos up to and including the end of the AlignmentBlock
for (int locusPos = accumulatorNextPosition; locusPos <= referencePositionStartOfAlignmentBlock + alignmentBlock.getLength(); ++locusPos) {
accumulator.add(createLocusInfo(getReferenceSequence(rec.getReferenceIndex()), locusPos));
}

/* Let's assume an alignment block starts in some locus.
* We put two records to the accumulator. The first one has the "begin" type which corresponds to the locus
* where the block starts. The second one has the "end" type which corresponds to the other locus where the block ends.
*/
// Let's assume an alignment block starts in some locus.
// We put two records to the accumulator. The first one has the "begin" type which corresponds to the locus
// where the block starts. The second one has the "end" type which corresponds to the other locus where the block ends.

// 0-based offset from the aligned position of the first base in the read to the aligned position
// of the current base.
int refOffsetInterval = refPos - rec.getAlignmentStart(); // corresponds to the beginning of the alignment block
int refOffsetEndInterval = refOffsetInterval + alignmentBlock.getLength();;
int startShift = 0;

// intersect intervals and alignment block
if (getIntervals() != null) {
// get the current interval we're processing
Interval interval = getCurrentInterval();
if (interval != null) {
final int intervalEnd = interval.getEnd();
final int intervalStart = interval.getStart();
// check if an interval and the alignment block overlap
if (!CoordMath.overlaps(refPos, refPos + alignmentBlock.getLength(), intervalStart, intervalEnd)) {
continue;
}
// if the alignment block starts out of an interval, shift the starting position
if (refPos < intervalStart) {
startShift = intervalStart - refPos;
refOffsetInterval = refOffsetInterval + startShift;
}
// if the alignment block ends out of an interval, shift the ending position
final int readEnd = refPos + alignmentBlock.getLength();
if (readEnd > intervalEnd) {
refOffsetEndInterval = refOffsetEndInterval - (readEnd - intervalEnd) + 1;
}
final int offsetStartOfAlignmentBlockOnReference = referencePositionStartOfAlignmentBlock - rec.getAlignmentStart();
// Similar for the end of the alignment block. We can simply add the length of the block, since by
// definition all bases in an AlignmentBlock match the reference alignment
final int offsetEndOfAlignmentBlockOnReference = offsetStartOfAlignmentBlockOnReference + alignmentBlock.getLength();

if (needToConsiderIntervals) {
// If the read isn't fully contained within the currentInterval, we need to manually handle each of the overlaps.

for (final Interval interval : overlapDetector.getOverlaps(new Interval(rec.getContig(), referencePositionStartOfAlignmentBlock, referencePositionStartOfAlignmentBlock + alignmentBlock.getLength()))) {
// In case the start position is smaller than the start of the interval, we need to determine the offset (we need this later)...
final int offsetStartOfIntervalInAlignmentBlock = referencePositionStartOfAlignmentBlock < interval.getStart() ? interval.getStart() - referencePositionStartOfAlignmentBlock : 0;
// ... and add it to the start position to get the actual position from where we want to count.
final int offsetStartOfActualSequenceOnReference = offsetStartOfAlignmentBlockOnReference + offsetStartOfIntervalInAlignmentBlock;

// Similarly, we need to determine the actual end of the sequence we want to consider.
final int referencePositionEndOfAlignmentBlock = referencePositionStartOfAlignmentBlock + alignmentBlock.getLength();
// For that, we find the difference between the end position of the AlignmentBlock and the end of the interval, and subtract it from the offset of end of the AlignmentBlock
final int offsetEndOfActualSequenceOnReference = offsetEndOfAlignmentBlockOnReference - (referencePositionEndOfAlignmentBlock > interval.getEnd() ? referencePositionEndOfAlignmentBlock - interval.getEnd() - 1 : 0);

final int length = offsetEndOfActualSequenceOnReference - offsetStartOfActualSequenceOnReference;

// accumulate start of the overlap block
final EdgingRecordAndOffset recordAndOffset = createRecordAndOffset(rec, offsetStartOfAlignmentBlockInRead + offsetStartOfIntervalInAlignmentBlock, length, referencePositionStartOfAlignmentBlock + offsetStartOfIntervalInAlignmentBlock);
accumulator.get(offsetStartOfActualSequenceOnReference).add(recordAndOffset);

// accumulate end of the overlap block
final EdgingRecordAndOffset recordAndOffsetEnd = createRecordAndOffset(recordAndOffset);
accumulator.get(offsetEndOfActualSequenceOnReference).add(recordAndOffsetEnd);
}
} else {
// If the read is fully contained within the interval, then we don't need to determine the overlaps,
// which will speed this up significantly.

final int length = offsetEndOfAlignmentBlockOnReference - offsetStartOfAlignmentBlockOnReference;

// accumulate start of the alignment block
final EdgingRecordAndOffset recordAndOffset = createRecordAndOffset(rec, offsetStartOfAlignmentBlockInRead, length, referencePositionStartOfAlignmentBlock);
accumulator.get(offsetStartOfAlignmentBlockOnReference).add(recordAndOffset);

// accumulate end of the alignment block
final EdgingRecordAndOffset recordAndOffsetEnd = createRecordAndOffset(recordAndOffset);
accumulator.get(offsetEndOfAlignmentBlockOnReference).add(recordAndOffsetEnd);
}
final int length = refOffsetEndInterval - refOffsetInterval;
// add the alignment block to the accumulator when it starts and when it ends
final EdgingRecordAndOffset recordAndOffset = createRecordAndOffset(rec, readOffset + startShift, length, refPos + startShift);
// accumulate start of the alignment block
accumulator.get(refOffsetInterval).add(recordAndOffset);
final EdgingRecordAndOffset recordAndOffsetEnd = createRecordAndOffset(recordAndOffset);
// accumulate end of the alignment block
accumulator.get(refOffsetEndInterval).add(recordAndOffsetEnd);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,9 @@ public class IntervalListReferenceSequenceMask implements ReferenceSequenceMask

public IntervalListReferenceSequenceMask(final IntervalList intervalList) {
this.header = intervalList.getHeader();
if (intervalList.getHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
intervalList.sorted();
}
final List<Interval> uniqueIntervals = intervalList.uniqued().getIntervals();
final boolean isSorted = intervalList.getHeader().getSortOrder() == SAMFileHeader.SortOrder.coordinate;
final IntervalList sortedIntervalList = isSorted ? intervalList : intervalList.sorted();
final List<Interval> uniqueIntervals = sortedIntervalList.uniqued().getIntervals();
if (uniqueIntervals.isEmpty()) {
lastSequenceIndex = -1;
lastPosition = 0;
Expand Down
Loading

0 comments on commit 347c0ac

Please sign in to comment.