From 15725e4eeb5bb4012a634577936f5fc3099f73d2 Mon Sep 17 00:00:00 2001 From: Dror Kessler Date: Mon, 12 Feb 2024 12:12:02 +0200 Subject: [PATCH] code cleanup (of unused methods, etc) --- src/main/java/picard/flow/FlowBasedRead.java | 31 +- .../java/picard/flow/FlowBasedReadUtils.java | 287 +----------------- 2 files changed, 21 insertions(+), 297 deletions(-) diff --git a/src/main/java/picard/flow/FlowBasedRead.java b/src/main/java/picard/flow/FlowBasedRead.java index 25196c4bcb..a3d0db38d8 100644 --- a/src/main/java/picard/flow/FlowBasedRead.java +++ b/src/main/java/picard/flow/FlowBasedRead.java @@ -12,8 +12,8 @@ public class FlowBasedRead { // constants private final double MINIMAL_CALL_PROB = 0.1; - final public static String FLOW_MATRIX_TAG_NAME = "tp"; - final public static String FLOW_MATRIX_T0_TAG_NAME = "t0"; + final private static String FLOW_MATRIX_TAG_NAME = "tp"; + final private static String FLOW_MATRIX_T0_TAG_NAME = "t0"; final public static String MAX_CLASS_READ_GROUP_TAG = "mc"; /** @@ -29,16 +29,14 @@ public class FlowBasedRead { private int[] key; /** - * The maximal length of an hmer that can be encoded (normally in the 10-12 range) + * The maximal length of an hmer that can be encoded (normally in the 10-20 range) */ private int maxHmer; - private double totalMinErrorProbability; - private double perHmerMinErrorProbability; + /** - * The order in which flow key in encoded (See decription for key field). Flow order may be wrapped if a longer one - * needed. + * computed minimal error probability for an hmer */ - private byte[] flowOrder; + private double perHmerMinErrorProbability; /** * The probability matrix for this read. [n][m] position represents that probablity that an hmer of n length will be @@ -119,15 +117,14 @@ private void spreadFlowLengthProbsAcrossCountsAtFlow(final int flowToSpread) { private void readFlowMatrix(final String _flowOrder) { // generate key (base to flow space) - if (fbargs.fillingValue == 0) { - totalMinErrorProbability = estimateFillingValue(); - } else { - totalMinErrorProbability = fbargs.fillingValue; - } - perHmerMinErrorProbability = totalMinErrorProbability/getMaxHmer(); + // establish min probability + final double totalMinErrorProbability = (fbargs.fillingValue == 0) ? estimateFillingValue() : fbargs.fillingValue; + perHmerMinErrorProbability = totalMinErrorProbability / getMaxHmer(); + + // generate flow key key = FlowBasedReadUtils.baseArrayToKey(samRecord.getReadBases(), _flowOrder); - flowOrder = FlowBasedReadUtils.getFlowToBase(_flowOrder, key.length); + // initialize matrix flowMatrix = new double[maxHmer + 1][key.length]; for (int i = 0; i < maxHmer + 1; i++) { @@ -170,7 +167,7 @@ private void readFlowMatrix(final String _flowOrder) { } if ((run == 0) && (specialTreatmentForZeroCalls)) { - parseZeroQuals(t0probs, i, qualOfs); + parseZeroQuals(t0probs, i, qualOfs, totalMinErrorProbability); } double totalErrorProb = 0; @@ -208,7 +205,7 @@ private void parseSingleHmer(final double[] probs, final byte[] tp, final int fl // place it on the neighboring bases and choose the **lower** error probability between the // neighbors (that's how T0 encoding works). The error is placed only on the 1-mer error assuming // that 2->0 errors are negligibly rare. - private void parseZeroQuals(final double[] probs, final int flowIdx, final int qualOfs) { + private void parseZeroQuals(final double[] probs, final int flowIdx, final int qualOfs, final double totalMinErrorProbability) { if ((qualOfs == 0) | (qualOfs == probs.length)) { // do not report zero error probability on the edge of the read return; } diff --git a/src/main/java/picard/flow/FlowBasedReadUtils.java b/src/main/java/picard/flow/FlowBasedReadUtils.java index 8dcdf6aabc..fa7f3a92f8 100644 --- a/src/main/java/picard/flow/FlowBasedReadUtils.java +++ b/src/main/java/picard/flow/FlowBasedReadUtils.java @@ -1,8 +1,6 @@ package picard.flow; -import com.google.common.annotations.VisibleForTesting; import htsjdk.samtools.*; -import htsjdk.samtools.util.SequenceUtil; import picard.PicardException; import java.util.ArrayList; @@ -63,163 +61,33 @@ static public int[] baseArrayToKey(final byte[] bases, final String flowOrder){ return ret; } - /** - * For every flow of the key output the index of the last base that was output prior to this flow - * @param key given key - * @return array - */ - static public int[] getKeyToBase(final int[] key) { - final int[] result = new int[key.length]; - result[0] = -1; - for (int i = 1; i < result.length; i++) { - result[i] = result[i - 1] + key[i - 1]; - } - return result; - - } - - /** - * For every flow of the key output the nucleotide that is being read for this flow - * @param flowOrder given flow order - * @param expectedLength the length of the key (key is not provided) - * @return array of bases - */ - - static public byte[] getFlowToBase(final String flowOrder, final int expectedLength) { - final byte[] result = new byte[expectedLength] ; - for ( int i = 0; i < result.length; i++ ) { - result[i] = (byte)flowOrder.charAt(i%flowOrder.length()); - } - return result; - } - - /** - * Prints the key as character-encoded string - * @param ints (key array) - * @return encoded string - */ - static public String keyAsString(final int[] ints) - { - final StringBuilder sb = new StringBuilder(); - - for ( final int i : ints ) - sb.append((char)((i < 10) ? ('0' + i) : ('A' + i - 10))); - - return sb.toString(); - } - - /** - * Converts a numerical array into flow space by flattening per-base elements to fill empty flows based on minimum scores. - * This is intended to make it easier to transform per-base read statistics that are computed on the read in base-space - * sanely into flow-space reads in a reasonable fashion that makes sense for parameters like indel-likelihoods. - * - * The rules are as follows: - * - For every run of homopolymers, the minimum score for any given base is translated to cover the whole run - * - For every 0 base flow, the score from the previous filled flow is copied into place. - * - For the beginning of the array (in the case of preceding 0-flows) the given default value is used. - * - * Example: - * A read with the following bases with a base-space calculated into flow space (ACTG) as: - * TTTATGC -> 0030101101 - * With an input array like this: - * byte[]{1,2,3,4,5,6,7} - * We should expect the following result array (Which matches the key array in length) - * byte[]{defaultQual,defaultQual,1,1,4,4,5,6,6,7} - * - * @param bases base space sequence - * @param keyLength size of the converted bases array in flow-space - * @param baseSpacedArrayToConvert Array of base-space scores to be conformed to flow space - * @param defaultQual default quality to use at the head of the array for non-covered flows - * @param flowOrder flow order - * @return Array of translated bases comparable to the keyLength - */ - @VisibleForTesting - static public byte[] baseArrayToKeySpace(final byte[] bases, final int keyLength, final byte[] baseSpacedArrayToConvert, final byte defaultQual, final String flowOrder){ - - if (bases.length != baseSpacedArrayToConvert.length) { - throw new IllegalArgumentException("Read and qual arrays do not match"); - } - - final byte[] result = new byte[keyLength]; - final byte[] flowOrderBytes = flowOrder.getBytes(); - int loc = 0; - int flowNumber = 0 ; - byte lastQual = defaultQual; - final int period = flowOrderBytes.length; - while ( loc < bases.length ) { - final byte flowBase = flowOrderBytes[flowNumber%period]; - if ((bases[loc]!=flowBase) && ( bases[loc]!='N')) { - result[flowNumber] = lastQual; - } else { - byte qual = Byte.MAX_VALUE; - while ( ( loc < bases.length) && ((bases[loc]==flowBase) || (bases[loc]== 'N')) ){ - qual = (byte)Math.min(baseSpacedArrayToConvert[loc], qual); - loc++; - } - result[flowNumber] = qual; - lastQual = qual; + static public class ReadGroupInfo { + private static final String PLATFORM_ULTIMA = "ULTIMA"; + private static final String PLATFORM_LS454 = "LS454"; - } - flowNumber++; - } - return result; - } - static public class ReadGroupInfo { final public String flowOrder; final public int maxClass; final public boolean isFlowPlatform; - private String reversedFlowOrder = null; public ReadGroupInfo(final SAMReadGroupRecord readGroup) { - if (readGroup.getPlatform()==null){ - isFlowPlatform = false; - } else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform())==NGSPlatform.UNKNOWN){ - isFlowPlatform = false; - } else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform()) == NGSPlatform.LS454) { - //old Ultima data can have PLATFORM==LS454 and not have FO tag - isFlowPlatform = true; - } else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform()) == NGSPlatform.ULTIMA){ - if (readGroup.getFlowOrder()!=null) { - isFlowPlatform = true; - } else { - throw new RuntimeException("Malformed Ultima read group identified, aborting: " + readGroup); - } - } else { - isFlowPlatform = false; - } + isFlowPlatform = PLATFORM_ULTIMA.equals(readGroup.getPlatform()) || PLATFORM_LS454.equals(readGroup.getPlatform()); if (isFlowPlatform) { this.flowOrder = readGroup.getFlowOrder(); - String mc = readGroup.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG); + final String mc = readGroup.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG); this.maxClass = (mc == null) ? FlowBasedRead.MAX_CLASS : Integer.parseInt(mc); - } else { // not a flow platform + } else { this.flowOrder = null; this.maxClass = 0; } } - public synchronized String getReversedFlowOrder() { - if ( reversedFlowOrder == null ) { - reversedFlowOrder = SequenceUtil.reverseComplement(flowOrder); - } - return reversedFlowOrder; - } - - } - - public static boolean hasFlowTags(final SAMRecord rec) { - return rec.hasAttribute(FlowBasedRead.FLOW_MATRIX_TAG_NAME); - } public static synchronized ReadGroupInfo getReadGroupInfo(final SAMFileHeader hdr, final SAMRecord read) { - if ( !hasFlowTags(read) ) { - throw new IllegalArgumentException("read must be flow based: " + read); - } - - String name = read.getReadGroup().getReadGroupId(); + final String name = read.getReadGroup().getReadGroupId(); ReadGroupInfo info = readGroupInfo.get(name); if ( info == null ) { readGroupInfo.put(name, info = new ReadGroupInfo(hdr.getReadGroup(name))); @@ -227,145 +95,4 @@ public static synchronized ReadGroupInfo getReadGroupInfo(final SAMFileHeader hd return info; } - /** - /* - * clips flows from the left to clip the input number of bases - * Needed to trim the haplotype to the read - * Returns number of flows to remove and the change in the left most remaining flow if necessary - */ - static public int[] findLeftClipping(final int baseClipping, final int[] flow2base, final int[] key) { - final int [] result = new int[2]; - if (baseClipping == 0 ){ - return result; - } - - int stopClip = 0; - for (int i = 0 ; i < flow2base.length; i++ ) { - - if (flow2base[i] + key[i] >= baseClipping) { - stopClip = i; - break; - } - } - final int hmerClipped = baseClipping - flow2base[stopClip] - 1; - result[0] = stopClip; - result[1] = hmerClipped; - return result; - } - - /* - * clips flows from the right to trim the input number of bases - * Returns number of flows to remove and the change in the right most flow. - */ - static public int[] findRightClipping(final int baseClipping, final int[] rFlow2Base, final int[] rKey) { - final int [] result = new int[2]; - if (baseClipping == 0 ){ - return result; - } - - int stopClip = 0; - - for (int i = 0; i < rFlow2Base.length; i++ ) { - if (rFlow2Base[i] + rKey[i] >= baseClipping) { - stopClip = i; - break; - } - } - - final int hmerClipped = baseClipping - rFlow2Base[stopClip] - 1; - result[0] = stopClip; - result[1] = hmerClipped; - return result; - } - - /** - * Retrieve flow matrix modifications matrix from its string argument format. This matrix contains - * logic for modifying the flow matrix as it is read in. If the value of [n] is not zero, - * then the hmer probability for hmer length n will be copied to the [n] position - * - * For the implementation logic, see FlowBasedRead.fillFlowMatrix - */ - static public int[] getFlowMatrixModsInstructions(final String flowMatrixMods, final int maxHmer) { - - if ( flowMatrixMods != null ) { - final int[] flowMatrixModsInstructions = new int[maxHmer + 1]; - - final String[] toks = flowMatrixMods.split(","); - for ( int i = 0 ; i < toks.length - 1 ; i += 2 ) { - final int hmer = Integer.parseInt(toks[i]); - flowMatrixModsInstructions[hmer] = Integer.parseInt(toks[i + 1]); - } - return flowMatrixModsInstructions; - } else { - return null; - } - } - - /** - * A canonical, master list of the standard NGS platforms. These values - * can be obtained (efficiently) from a SAMRecord object with the - * getNGSPlatform method. - */ - public enum NGSPlatform { - // note the order of elements here determines the order of matching operations, and therefore the - // efficiency of getting a NGSPlatform from a string. - ILLUMINA(SequencerFlowClass.DISCRETE, "ILLUMINA", "SLX", "SOLEXA"), - SOLID(SequencerFlowClass.DISCRETE, "SOLID"), - LS454(SequencerFlowClass.FLOW, "454", "LS454"), - COMPLETE_GENOMICS(SequencerFlowClass.DISCRETE, "COMPLETE"), - PACBIO(SequencerFlowClass.DISCRETE, "PACBIO"), - ION_TORRENT(SequencerFlowClass.FLOW, "IONTORRENT"), - CAPILLARY(SequencerFlowClass.OTHER, "CAPILLARY"), - HELICOS(SequencerFlowClass.OTHER, "HELICOS"), - ULTIMA(SequencerFlowClass.FLOW, "ULTIMA"), - UNKNOWN(SequencerFlowClass.OTHER, "UNKNOWN"); - - - /** - * Array of the prefix names in a BAM file for each of the platforms. - */ - protected final String[] BAM_PL_NAMES; - protected final SequencerFlowClass sequencerType; - - NGSPlatform(final SequencerFlowClass type, final String... BAM_PL_NAMES) { - if ( BAM_PL_NAMES.length == 0 ) throw new IllegalStateException("Platforms must have at least one name"); - - for ( int i = 0; i < BAM_PL_NAMES.length; i++ ) - BAM_PL_NAMES[i] = BAM_PL_NAMES[i].toUpperCase(); - - this.BAM_PL_NAMES = BAM_PL_NAMES; - this.sequencerType = type; - } - - /** - * Returns the NGSPlatform corresponding to the PL tag in the read group - * @param plFromRG -- the PL field (or equivalent) in a ReadGroup object. Can be null => UNKNOWN - * @return an NGSPlatform object matching the PL field of the header, or UNKNOWN if there was no match or plFromRG is null - */ - public static NGSPlatform fromReadGroupPL(final String plFromRG) { - if ( plFromRG == null ) return UNKNOWN; - - final String pl = plFromRG.toUpperCase(); - for ( final NGSPlatform ngsPlatform : NGSPlatform.values() ) { - for ( final String bamPLName : ngsPlatform.BAM_PL_NAMES ) { - if ( pl.contains(bamPLName) ) - return ngsPlatform; - } - } - - return UNKNOWN; - } - - /** - * In broad terms, each sequencing platform can be classified by whether it flows nucleotides in some order - * such that homopolymers get sequenced in a single event (ie 454 or Ion) or it reads each position in the - * sequence one at a time, regardless of base composition (Illumina or Solid). This information is primarily - * useful in the BQSR process - */ - public enum SequencerFlowClass { - DISCRETE, - FLOW, - OTHER //Catch-all for unknown platforms, as well as relics that GATK doesn't handle well (Capillary, Helicos) - } - } }