Skip to content

Commit

Permalink
-fixes a bug in the liftover logic whereby when the alleles couldn't … (
Browse files Browse the repository at this point in the history
#1956)

* -fixes a bug in the liftover logic whereby when the alleles couldn't be extended to the "left" because the start of the variant was already at position 1, the resulting VC was corrupt leading to the problem discussed in !1951
  • Loading branch information
yfarjoun authored Apr 11, 2024
1 parent 0495b01 commit 372cd5c
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 29 deletions.
80 changes: 57 additions & 23 deletions src/main/java/picard/util/LiftoverUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -357,8 +357,6 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
throw new IllegalArgumentException(String.format("Reference allele doesn't match reference at %s:%d-%d", referenceSequence.getName(), start, end));
}

boolean changesInAlleles = true;

final Map<Allele, byte[]> alleleBasesMap = new HashMap<>();

// Put each allele into the alleleBasesMap unless it is a spanning deletion.
Expand All @@ -370,43 +368,75 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
int theStart = start;
int theEnd = end;

// 1. while changes in alleles do
while (changesInAlleles) {
/** the algorithm below is an implementation of the pseudo-code provided here:
* https://genome.sph.umich.edu/wiki/Variant_Normalization
*
* The numbers in the comments match with the numbers in the pseudo-code in this image:
* https://genome.sph.umich.edu/wiki/File:Variant_normalization_algorithm.png
*/

int oldStart, oldEnd;

// while changes in alleles do
do {
oldStart = theStart;
oldEnd = theEnd;

// this block will convert
// REF = "ACG", ALT="ATG"
// to
// REF = "AC", ALT="AT"

changesInAlleles = false;
// 2. if alleles end with the same nucleotide then
if (alleleBasesMap.values().stream()
.collect(Collectors.groupingBy(a -> a[a.length - 1], Collectors.toSet()))
.size() == 1 && theEnd > 1) {
// 3. truncate rightmost nucleotide of each allele
alleleBasesMap.replaceAll((a, v) -> truncateBase(alleleBasesMap.get(a), true));
changesInAlleles = true;
theEnd--;
// 4. end if
}

// This block will extend all the alleles if one of them is empty:
// REF: "" ALT: "A"
// to
// REF: "G" ALT: "GA" (assuming there's previous base in the reference, and it is G)
// or
// REF: "T" ALT: "AT" (if the A was inserted before the first base in the reference sequence, "T" , we anchor on the "T")

// 5. if there exists an empty allele then
if (alleleBasesMap.values().stream()
.map(a -> a.length)
.anyMatch(l -> l == 0)) {
// 6. extend alleles 1 nucleotide to the left
for (final Allele allele : alleleBasesMap.keySet()) {

// The comment above says to extend to the left, but this isn't always possible. The following block
// decides where to extend and with which base.

final byte extraBase;
final boolean extendLeft;
if (theStart > 1) {
// the first -1 for zero-base (getBases) versus 1-based (variant position)
// another -1 to get the base prior to the location of the start of the allele
final byte extraBase = (theStart > 1) ?
referenceSequence.getBases()[theStart - 2] :
referenceSequence.getBases()[theEnd];
extraBase = referenceSequence.getBases()[theStart - 2];
extendLeft = true;
theStart--;
} else {
extraBase = referenceSequence.getBases()[theEnd];
extendLeft = false;
theEnd++;
}

alleleBasesMap.put(allele, extendOneBase(alleleBasesMap.get(allele), extraBase));
// This block actually updates the alleles according (by adding the base to the left or right)
for (final Allele allele : alleleBasesMap.keySet()) {
alleleBasesMap.put(allele, extendOneBase(alleleBasesMap.get(allele), extraBase, extendLeft));
}
changesInAlleles = true;
theStart--;

// 7. end if
}
}
} // 7. end if
// 8. end while (when alleles change either the start or the end should change)
} while (theStart != oldStart || theEnd != oldEnd);

// 8. while leftmost nucleotide of each allele are the same and all alleles have length 2 or more do
// 9. while leftmost nucleotide of each allele are the same and all alleles have length 2 or more do
while (alleleBasesMap.values().stream()
.allMatch(a -> a.length >= 2) &&

Expand All @@ -415,10 +445,10 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
.size() == 1
) {

//9. truncate the leftmost base of the alleles
// 10. truncate the leftmost base of the alleles
alleleBasesMap.replaceAll((a, v) -> truncateBase(alleleBasesMap.get(a), false));
theStart++;
}
} // 11. end while.

builder.start(theStart);
builder.stop(theEnd);
Expand All @@ -431,7 +461,7 @@ protected static void leftAlignVariant(final VariantContextBuilder builder, fina
// a special case here.
fixedAlleleMap.put(Allele.SPAN_DEL, Allele.SPAN_DEL);

// symbolic alleles cannot be left-aligned so they need to be put
// symbolic alleles cannot be left-aligned, so they need to be put
// into the map directly
alleles.stream()
.filter(Allele::isSymbolic)
Expand All @@ -451,12 +481,16 @@ private static byte[] truncateBase(final byte[] allele, final boolean truncateRi
truncateRightmost ? allele.length - 1 : allele.length);
}

//creates a new byte array with the base added at the beginning
private static byte[] extendOneBase(final byte[] bases, final byte base) {
return extendOneBase(bases, base, true);
}

//creates a new byte array with the base added at the beginning
private static byte[] extendOneBase(final byte[] bases, final byte base, final boolean extendLeft) {
final byte[] newBases = new byte[bases.length + 1];

System.arraycopy(bases, 0, newBases, 1, bases.length);
newBases[0] = base;
System.arraycopy(bases, 0, newBases, extendLeft ? 1 : 0, bases.length);
newBases[extendLeft ? 0 : bases.length] = base;

return newBases;
}
Expand Down
58 changes: 52 additions & 6 deletions src/test/java/picard/util/LiftoverVcfTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
import htsjdk.samtools.liftover.LiftOver;
import htsjdk.samtools.reference.FastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.*;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
Expand Down Expand Up @@ -59,6 +56,10 @@ public class LiftoverVcfTest extends CommandLineProgramTest {
private static final String refString = "CAAAAAAAAAACGTACGTACTCTCTCTCTACGT";
private static final ReferenceSequence REFERENCE = new ReferenceSequence("chr1", 0, refString.getBytes());

private static final String refStringWithRepeat = "CGTCGTCGT";

private static final ReferenceSequence REFERENCE_WITH_REPEATS = new ReferenceSequence("chr1", 0, refStringWithRepeat.getBytes());

public String getCommandLineProgramName() {
return LiftoverVcf.class.getSimpleName();
}
Expand Down Expand Up @@ -174,8 +175,8 @@ public void testChangingTagArguments() {
@Test
public void testReverseComplementFailureDoesNotErrorOut() {
final VariantContextBuilder builder = new VariantContextBuilder().source("test").loc("chr1", 1, 4);
final Allele originalRef = Allele.create("CCCC", true);
final Allele originalAlt = Allele.create("C", false);
final Allele originalRef = Allele.create("TCAT", true);
final Allele originalAlt = Allele.create("T", false);
builder.alleles(Arrays.asList(originalRef, originalAlt));

final Interval interval = new Interval("chr1", 1, 4, true, "test ");
Expand All @@ -185,6 +186,26 @@ public void testReverseComplementFailureDoesNotErrorOut() {

// we don't actually care what the results are here -- we just want to make sure that it doesn't fail
final VariantContextBuilder result = LiftoverUtils.reverseComplementVariantContext(builder.make(), interval, refSeq);
Assert.assertNotNull(result);
}


@Test
public void testReverseComplementWithRepeats() {
final VariantContextBuilder builder = new VariantContextBuilder().source("test").loc("chr1", 3, 6);
final Allele originalRef = Allele.create("CATC", true);
final Allele originalAlt = Allele.create("C", false);
builder.alleles(Arrays.asList(originalRef, originalAlt));

final Interval interval = new Interval("chr1", 3, 6, true, "test ");

final String reference = "ATGATGATGA";
final ReferenceSequence refSeq = new ReferenceSequence("chr1", 10, reference.getBytes());

// we don't actually care what the results are here -- we just want to make sure that it doesn't fail
final VariantContextBuilder result = LiftoverUtils.reverseComplementVariantContext(builder.make(), interval, refSeq);
Assert.assertNotNull(result);
Assert.assertTrue(result.getStart() > 0);
}

@DataProvider(name = "dataTestMissingContigInReference")
Expand Down Expand Up @@ -975,6 +996,31 @@ public void testLeftAlignVariants(final VariantContext source, final ReferenceSe
VcfTestUtils.assertEquals(vcb.make(), result);
}

@Test
public void testLeftAlignInRepeatingStart(){
// reference: CGTCGTCGT
// 123456789
// TCGT->T

final VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1");
builder.start(3).stop(6).alleles(CollectionUtil.makeList(Allele.create("TCGT",true), Allele.create("T")));
GenotypeBuilder genotypeBuilder = new GenotypeBuilder();
genotypeBuilder.alleles(builder.getAlleles());
builder.genotypes(genotypeBuilder.make());

VariantContext vc = builder.make();
final List<Allele> origAlleles = new ArrayList<>(builder.getAlleles());

// This path mimics the codepath through the LiftoverUtils.reverseComplementVariantContext but without reverseComplementing
LiftoverUtils.leftAlignVariant(builder, vc.getStart(), vc.getEnd(), vc.getAlleles(), REFERENCE_WITH_REPEATS);
builder.genotypes(LiftoverUtils.fixGenotypes(builder.getGenotypes(), origAlleles, builder.getAlleles()));
VariantContext newVc = builder.make();
final String refString = StringUtil.bytesToString(REFERENCE_WITH_REPEATS.getBases(), newVc.getStart() - 1, newVc.getEnd() - newVc.getStart() + 1);

Assert.assertEquals(refString, "CGTC");
}


@DataProvider(name = "indelNoFlipData")
public Iterator<Object[]> indelNoFlipData() {

Expand Down

0 comments on commit 372cd5c

Please sign in to comment.