From 8e5bf30bf9c4f24b31fea42a602b7f0efbaceae6 Mon Sep 17 00:00:00 2001
From: Ammar Aziz
Date: Fri, 8 Mar 2024 17:10:49 +1100
Subject: [PATCH] upstream: Align trim (#2)

* adding --min-depth flag to minion and parameterising the mask and stats calls with it
* parameterising the vcf filter with the same global min-depth variable for consistency
* adding github actions CI
* adding github actions CI
* adding github actions CI
* adding github actions CI
* moving unit tests over to github actions
* parameterising the vcf filter with the same global min-depth variable for consistency
* replacing travis with github action CI
* adding DOI to README
* updating artic-tools, adding more stats to artic report, running report regardless of strict, setting strict to filter VCF
* adding opt to enforce full length amplicon match during align trim, activate using --strict flag to minion
* tweaking report generator
* version bump
* docs update
* replacing longshot with the latest medaka
* trying to fix medaka install on osx
* adding bcftools norm command to remove vars from the pass VCF if they have been masked out in the preconsensus
* cleaner parsing of GQ field
* upgrading to latest medaka and dropping longshot from the conda env
* exposing opts for vcf_filter
* bumping medaka to latest patch
* unpinning pandas
* Change to overlap-based method of amplicon resolution
* Integrate new align_trim.py

---------

Co-authored-by: Will Rowe
Co-authored-by: Chris Wright
---
 artic/align_trim.py | 352 ++++++++++++++++++++++++--------------------
 artic/minion.py     |  18 ++-
 2 files changed, 212 insertions(+), 158 deletions(-)

diff --git a/artic/align_trim.py b/artic/align_trim.py
index 4ade82b3..96dabc67 100755
--- a/artic/align_trim.py
+++ b/artic/align_trim.py
@@ -1,51 +1,31 @@
 #!/usr/bin/env python
-# Written by Nick Loman
-
 from copy import copy
-from collections import defaultdict
-import pysam
+from collections import defaultdict, namedtuple
 import sys
-from .vcftagprimersites import read_bed_file
+import numpy as np
+import pysam
+
+from artic.vcftagprimersites import read_bed_file
+
+PassRead = namedtuple(
+    'PassRead',
+    ('qname', 'amplicon', 'coverage', 'left', 'right'))
 
 # consumesReference lookup for if a CIGAR operation consumes the reference sequence
 consumesReference = [True, False, True, True, False, False, False, True]
-
 # consumesQuery lookup for if a CIGAR operation consumes the query sequence
 consumesQuery = [True, True, False, False, True, False, False, True]
 
 
-def find_primer(bed, pos, direction):
-    """Given a reference position and a direction of travel, walk out and find the nearest primer site.
-
-    Parameters
-    ----------
-    bed : list
-        A list of dictionaries, where each dictionary contains a row of bedfile data
-    pos : int
-        The position in the reference sequence to start from
-    direction : string
-        The direction to search along the reference sequence
-
-    Returns
-    -------
-    tuple
-        The offset, distance and bed entry for the closest primer to the query position
-    """
-    from operator import itemgetter
-
-    if direction == '+':
-        closest = min([(abs(p['start'] - pos), p['start'] - pos, p)
-                       for p in bed if p['direction'] == direction], key=itemgetter(0))
-    else:
-        closest = min([(abs(p['end'] - pos), p['end'] - pos, p)
-                       for p in bed if p['direction'] == direction], key=itemgetter(0))
-    return closest
+def overlap_size(o1, o2):
+    s1, e1 = sorted(o1)
+    s2, e2 = sorted(o2)
+    return min(e1, e2) - max(s1, s2)
 
 
 def trim(segment, primer_pos, end, debug):
     """Soft mask an alignment to fit within primer start/end sites.
-
     Parameters
     ----------
     segment : pysam.AlignedSegment
@@ -146,138 +126,190 @@ def trim(segment, primer_pos, end, debug):
     return
 
 
-def go(args):
-    """Filter and soft mask an alignment file so that the alignment boundaries match the primer start and end sites.
-    Based on the most likely primer position, based on the alignment coordinates.
-    """
-    # prepare the report outfile
+def overlap_trim(args):
+    """Detect which amplicon each read derives from and trim primers accordingly."""
+
     if args.report:
         reportfh = open(args.report, "w")
-        print("QueryName\tReferenceStart\tReferenceEnd\tPrimerPair\tPrimer1\tPrimer1Start\tPrimer2\tPrimer2Start\tIsSecondary\tIsSupplementary\tStart\tEnd\tCorrectlyPaired", file=reportfh)
-
-    # set up a counter to track amplicon abundance
-    counter = defaultdict(int)
+        print(
+            "QueryName\tReferenceStart\tReferenceEnd\t"
+            "PrimerPair\t"
+            "Primer1\tPrimer1Start\t"
+            "Primer2\tPrimer2Start\t"
+            "IsSecondary\tIsSupplementary\t"
+            "Start\tEnd\tCorrectlyPaired", file=reportfh)
 
     # open the primer scheme and get the pools
    bed = read_bed_file(args.bedfile)
-    pools = set([row['PoolName'] for row in bed])
+    pools = set(row['PoolName'] for row in bed)
     pools.add('unmatched')
-
-    # open the input SAM file and process read groups
-    infile = pysam.AlignmentFile("-", "rb")
-    bam_header = infile.header.copy().to_dict()
-    if not args.no_read_groups:
-        bam_header['RG'] = []
-        for pool in pools:
-            read_group = {}
-            read_group['ID'] = pool
-            bam_header['RG'].append(read_group)
-
-    # prepare the alignment outfile
-    outfile = pysam.AlignmentFile("-", "wh", header=bam_header)
+    primer_pairs = defaultdict(dict)
+    for b in bed:
+        scheme, pair, side = b['Primer_ID'].split('_')
+        primer_pairs[pair][side] = b
+    # this data structure is more useful for searching...
+    amplicons = np.fromiter((
+        (k, v['LEFT']['PoolName'],
+         v['LEFT']['end'], v['RIGHT']['start'],  # just insert
+         v['LEFT']['start'], v['RIGHT']['end'],  # contains primers
+         v['LEFT']['Primer_ID'], v['RIGHT']['Primer_ID'])
+        for k, v in primer_pairs.items()),
+        dtype=[
+            ('name', int), ('pool', int),
+            ('insert_start', int), ('insert_end', int),
+            ('start', int), ('end', int),
+            ('left_primer', 'U20'), ('right_primer', 'U20')])
 
     # iterate over the alignment segments in the input SAM file
-    for segment in infile:
-
-        # filter out unmapped and supplementary alignment segments
-        if segment.is_unmapped:
-            print("%s skipped as unmapped" %
-                  (segment.query_name), file=sys.stderr)
-            continue
-        if segment.is_supplementary:
-            print("%s skipped as supplementary" %
-                  (segment.query_name), file=sys.stderr)
-            continue
-
-        # locate the nearest primers to this alignment segment
-        p1 = find_primer(bed, segment.reference_start, '+')
-        p2 = find_primer(bed, segment.reference_end, '-')
-
-        # check if primers are correctly paired and then assign read group
-        # NOTE: removed this as a function as only called once
-        # TODO: will try improving this / moving it to the primer scheme processing code
-        correctly_paired = p1[2]['Primer_ID'].replace(
-            '_LEFT', '') == p2[2]['Primer_ID'].replace('_RIGHT', '')
+    passing_reads = defaultdict(list)  # by (amplicon, is_reverse)
+    with pysam.AlignmentFile(args.bamfile, "rb") as bam:
+        bam_header = bam.header.copy().to_dict()
         if not args.no_read_groups:
-            if correctly_paired:
-                segment.set_tag('RG', p1[2]['PoolName'])
-            else:
-                segment.set_tag('RG', 'unmatched')
-        if args.remove_incorrect_pairs and not correctly_paired:
-            print("%s skipped as not correctly paired" %
-                  (segment.query_name), file=sys.stderr)
-            continue
-
-        # ignore the read if it does not cover the entire amplicon
-        if args.enforce_amplicon_span and (segment.query_alignment_length < abs(p2[2]['start'] - p1[2]['end'])):
-            print("%s skipped as not full length amplicon match" % (segment.query_name), file=sys.stderr)
-            continue
-
-        # update the report with this alignment segment + primer details
-        report = "%s\t%s\t%s\t%s_%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d" % (segment.query_name, segment.reference_start, segment.reference_end, p1[2]['Primer_ID'], p2[2]['Primer_ID'], p1[2]['Primer_ID'], abs(
-            p1[1]), p2[2]['Primer_ID'], abs(p2[1]), segment.is_secondary, segment.is_supplementary, p1[2]['start'], p2[2]['end'], correctly_paired)
-        if args.report:
-            print(report, file=reportfh)
-        if args.verbose:
-            print(report, file=sys.stderr)
-
-        # get the primer positions
-        if args.start:
-            p1_position = p1[2]['start']
-            p2_position = p2[2]['end']
-        else:
-            p1_position = p1[2]['end']
-            p2_position = p2[2]['start']
+            bam_header['RG'] = []
+            for pool in pools:
+                read_group = {}
+                read_group['ID'] = pool
+                bam_header['RG'].append(read_group)
+
+        for segment in bam:
+            # filter out unmapped and supplementary alignment segments
+            if segment.is_unmapped:
+                print("%s skipped as unmapped" %
+                      (segment.query_name), file=sys.stderr)
+                continue
+            if segment.is_supplementary:
+                print("%s skipped as supplementary" %
+                      (segment.query_name), file=sys.stderr)
+                continue
 
-        # softmask the alignment if left primer start/end inside alignment
-        if segment.reference_start < p1_position:
-            try:
-                trim(segment, p1_position, False, args.verbose)
-                if args.verbose:
-                    print("ref start %s >= primer_position %s" %
-                          (segment.reference_start, p1_position), file=sys.stderr)
-            except Exception as e:
-                print("problem soft masking left primer in {} (error: {}), skipping" .format(
-                    segment.query_name, e), file=sys.stderr)
+            # determine the amplicon by largest overlap (and find second best)
+            overlaps = np.zeros(len(amplicons), dtype=int)
+            for i, amp in enumerate(amplicons):
+                overlaps[i] = segment.get_overlap(amp['start'], amp['end'])
+            best, second = np.argpartition(overlaps, -2)[:-3:-1]
+            if overlaps[best] < args.min_overlap:
+                print("%s skipped as no good overlap" % segment.qname, file=sys.stderr)
                 continue
 
-        # softmask the alignment if right primer start/end inside alignment
-        if segment.reference_end > p2_position:
-            try:
-                trim(segment, p2_position, True, args.verbose)
-                if args.verbose:
-                    print("ref start %s >= primer_position %s" %
-                          (segment.reference_start, p2_position), file=sys.stderr)
-            except Exception as e:
-                print("problem soft masking right primer in {} (error: {}), skipping" .format(
-                    segment.query_name, e), file=sys.stderr)
+            # don't take reads if the second best overlap is a large proportion
+            # of the mutual overlap of the amplicons
+            best_amplicon = amplicons[best]
+            second_amplicon = amplicons[second]
+            mutual = overlap_size(
+                *((amp['start'], amp['end'])
+                  for amp in (best_amplicon, second_amplicon)))
+            if overlaps[second] > args.max_mutual_overlap * mutual:
+                print("%s skipped as large secondary overlap" % segment.qname, file=sys.stderr)
+                continue
+            overlap = overlaps[best]
+            amplicon = amplicons[best]
+
+            # check whether the alignment extends to the primer at each end
+            extends = (
+                segment.reference_start <= amplicon['insert_start'],
+                segment.reference_end >= amplicon['insert_end'])
+
+            # if both primers, we call that "correctly paired"
+            if args.enforce_amplicon_span and not all(extends):
+                print("%s skipped as does not span: %s" %
+                      (segment.query_name, extends), file=sys.stderr)
                 continue
+            passing_reads[amplicon['name'], segment.is_reverse].append(
+                PassRead(segment.qname, amplicon['name'], overlap, *extends))
+        # end first pass
+
+    # filter alignments
+    print("Reads before filtering: {}".format(sum(len(x) for x in passing_reads.values())), file=sys.stderr)
+    chosen_reads = list()
+    if args.normalise:
+        for (amp, is_reverse), reads in passing_reads.items():
+            reads = sorted(reads, key=lambda x: x.coverage, reverse=True)
+            chosen_reads.extend(reads[0:args.normalise])
+    else:
+        for reads in passing_reads.values():
+            chosen_reads.extend(reads)
+    print("Reads after filtering: {}".format(len(chosen_reads)), file=sys.stderr)
+    chosen_reads = {r.qname:r for r in chosen_reads}
+
+    with \
+            pysam.AlignmentFile(args.bamfile, "rb") as bam, \
+            pysam.AlignmentFile("-", "wh", header=bam_header) as outfile:
+        for segment in bam:
+            wanted = segment.qname in chosen_reads
+            if not wanted or segment.is_unmapped or segment.is_supplementary:
+                continue
+            chosen = chosen_reads[segment.qname]
+            amplicon = amplicons[amplicons['name'] == chosen.amplicon]
+            if not args.no_read_groups:
+                segment.set_tag('RG', str(amplicon['pool'][0]))
 
-        # normalise if requested
-        if args.normalise:
-            pair = "%s-%s-%d" % (p1[2]['Primer_ID'],
-                                 p2[2]['Primer_ID'], segment.is_reverse)
-            counter[pair] += 1
-            if counter[pair] > args.normalise:
-                print("%s dropped as abundance theshold reached" %
+            if len(amplicon) > 1:
+                raise IndexError("Found more than one amplicon matching: {}".format(chosen))
+            else:
+                amplicon = amplicon[0]
+            if args.start:
+                trim_start, trim_end = amplicon['start'], amplicon['end']
+            else:
+                trim_start, trim_end = amplicon['insert_start'], amplicon['insert_end']
+
+            # softmask the alignment if left primer start/end inside alignment
+            if segment.reference_start < trim_start:
+                try:
+                    trim(segment, trim_start, False, args.verbose)
+                    if args.verbose:
+                        print("ref start %s < primer_position %s" %
+                            (segment.reference_start, trim_start),
+                            file=sys.stderr)
+                except Exception as e:
+                    print(
+                        "problem soft masking left primer in {} (error: {}), skipping".format(
+                            segment.query_name, e),
+                        file=sys.stderr)
+                    continue
+
+            # softmask the alignment if right primer start/end inside alignment
+            if segment.reference_end > trim_end:
+                try:
+                    trim(segment, trim_end, True, args.verbose)
+                    if args.verbose:
+                        print("ref end %s > primer_position %s" %
+                            (segment.reference_end, trim_end), file=sys.stderr)
+                except Exception as e:
+                    print(
+                        "problem soft masking right primer in {} (error: {}), skipping".format(
+                            segment.query_name, e),
+                        file=sys.stderr)
+                    continue
+
+            # check that the alignment still contains bases matching the reference
+            if 'M' not in segment.cigarstring:
+                print("%s dropped as does not match reference post masking" %
                       (segment.query_name), file=sys.stderr)
                 continue
-        # check the the alignment still contains bases matching the reference
-        if 'M' not in segment.cigarstring:
-            print("%s dropped as does not match reference post masking" %
-                  (segment.query_name), file=sys.stderr)
-            continue
-
-        # current alignment segment has passed filters, send it to the outfile
-        outfile.write(segment)
-
-    # close up the file handles
-    infile.close()
-    outfile.close()
-    if args.report:
-        reportfh.close()
+            # current alignment segment has passed filters, send it to the outfile
+            # update the report with this alignment segment + primer details
+            if args.report or args.verbose:
+                #"QueryName\tReferenceStart\tReferenceEnd\t"
+                #"PrimerPair\t"
+                #"Primer1\tPrimer1Start\t"
+                #"Primer2\tPrimer2Start\t"
+                #"IsSecondary\tIsSupplementary\t"
+                #"Start\tEnd\tCorrectlyPaired", file=reportfh)
+                report = '\t'.join(
+                    str(x) for x in (
+                        segment.query_name, segment.reference_start, segment.reference_end,
+                        '{}_{}'.format(amplicon['left_primer'], amplicon['right_primer']),
+                        amplicon['left_primer'], amplicon['start'],
+                        amplicon['right_primer'], amplicon['insert_end'],
+                        segment.is_secondary, segment.is_supplementary,
+                        amplicon['start'], amplicon['end'], int(all(extends))))
+                if args.report:
+                    print(report, file=reportfh)
+                if args.verbose:
+                    print(report, file=sys.stderr)
+            outfile.write(segment)
 
 
 def main():
@@ -285,21 +317,29 @@ def main():
     parser = argparse.ArgumentParser(
         description='Trim alignments from an amplicon scheme.')
 
-    parser.add_argument(
-        'bedfile', help='BED file containing the amplicon scheme')
+    parser.add_argument('bamfile',
+        help='alignment file.')
+    parser.add_argument('bedfile',
+        help='BED file containing the amplicon scheme.')
+    parser.add_argument('--min-overlap', type=int, default=0, dest='min_overlap',
+        help='Shortest allowed overlap to amplicon region')
+    parser.add_argument('--max-mutual-overlap', type=int, default=2, dest='max_mutual_overlap',
+        help='Maximum permissible overlap to second amplicon region as proportion of amplicon mutual overlap')
     parser.add_argument('--normalise', type=int,
                         help='Subsample to n coverage per strand')
-    parser.add_argument('--report', type=str, help='Output report to file')
+    parser.add_argument('--report', type=str,
+        help='Output report to file')
     parser.add_argument('--start', action='store_true',
                         help='Trim to start of primers instead of ends')
-    parser.add_argument('--no-read-groups', dest='no_read_groups',
-                        action='store_true', help='Do not divide reads into groups in SAM output')
-    parser.add_argument('--verbose', action='store_true', help='Debug mode')
-    parser.add_argument('--remove-incorrect-pairs', action='store_true')
-    parser.add_argument('--enforce-amplicon-span', dest='enforce_amplicon_span', action='store_true', help='Discard reads that don\'t cover the entire amplicon')
+    parser.add_argument('--no-read-groups', dest='no_read_groups', action='store_true',
+        help='Do not divide reads into groups in SAM output')
+    parser.add_argument('--verbose', action='store_true',
+        help='Debug mode')
+    parser.add_argument('--enforce-amplicon-span', action='store_true', dest='enforce_amplicon_span',
+        help='Discard reads that do not cover the entire amplicon')
     args = parser.parse_args()
 
-    go(args)
+    overlap_trim(args)
 
 
 if __name__ == "__main__":
diff --git a/artic/minion.py b/artic/minion.py
index 3a8d9858..98e8e09f 100755
--- a/artic/minion.py
+++ b/artic/minion.py
@@ -197,8 +197,22 @@ def run(parser, args):
         strict_string = '--enforce-amplicon-span'
     else:
         strict_string = ''
-    cmds.append("align_trim %s %s %s --start --remove-incorrect-pairs --report %s.alignreport.txt < %s.sorted.bam 2> %s.alignreport.er | samtools sort -T %s - -o %s.trimmed.rg.sorted.bam" % (strict_string, normalise_string, bed, args.sample, args.sample, args.sample, args.sample, args.sample))
-    cmds.append("align_trim %s %s %s --remove-incorrect-pairs --report %s.alignreport.txt < %s.sorted.bam 2> %s.alignreport.er | samtools sort -T %s - -o %s.primertrimmed.rg.sorted.bam" % (strict_string, normalise_string, bed, args.sample, args.sample, args.sample, args.sample, args.sample))
+    cmds.append(
+        "align_trim %s.sorted.bam %s "
+        "%s %s --start --report %s.alignreport.txt 2> %s.alignreport.er "
+        "| samtools sort -T %s - -o %s.trimmed.rg.sorted.bam" % (
+            args.sample, bed,
+            strict_string, normalise_string, args.sample, args.sample,
+            args.sample, args.sample)
+    )
+    cmds.append(
+        "align_trim %s.sorted.bam %s "
+        "%s %s --report %s.alignreport.txt 2> %s.alignreport.er "
+        "| samtools sort -T %s - -o %s.primertrimmed.rg.sorted.bam" % (
+            args.sample, bed,
+            strict_string, normalise_string, args.sample, args.sample,
+            args.sample, args.sample)
+    )
     cmds.append("samtools index %s.trimmed.rg.sorted.bam" % (args.sample))
     cmds.append("samtools index %s.primertrimmed.rg.sorted.bam" % (args.sample))
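
Reviewer note (not part of the patch): the sketch below is a minimal, self-contained illustration of the overlap-based amplicon assignment that the new overlap_trim() performs. The amplicon coordinates, the AMPLICONS table, and the helper name assign() are hypothetical; the two thresholds mirror the new --min-overlap and --max-mutual-overlap options.

# Illustrative sketch only -- hypothetical coordinates, not a real primer scheme.
def overlap_size(o1, o2):
    # size of the intersection of two (start, end) reference intervals
    s1, e1 = sorted(o1)
    s2, e2 = sorted(o2)
    return min(e1, e2) - max(s1, s2)

# hypothetical amplicons: (name, start, end), coordinates include the primers
AMPLICONS = [(1, 30, 410), (2, 320, 720), (3, 640, 1030)]

def assign(read_start, read_end, min_overlap=0, max_mutual_overlap=2):
    # rank amplicons by how much reference overlap they share with the read
    ranked = sorted(
        AMPLICONS,
        key=lambda a: overlap_size((read_start, read_end), (a[1], a[2])),
        reverse=True)
    best, second = ranked[0], ranked[1]
    best_ov = overlap_size((read_start, read_end), (best[1], best[2]))
    second_ov = overlap_size((read_start, read_end), (second[1], second[2]))
    if best_ov < min_overlap:
        return None  # no good overlap with any amplicon
    # reject ambiguous reads: the runner-up overlap must stay small relative
    # to the mutual overlap of the two best amplicons
    mutual = overlap_size((best[1], best[2]), (second[1], second[2]))
    if second_ov > max_mutual_overlap * mutual:
        return None  # large secondary overlap
    return best[0]

print(assign(100, 380))  # read sits inside amplicon 1 -> prints 1
print(assign(330, 700))  # read sits inside amplicon 2 -> prints 2

The patch applies the same idea per aligned segment: pysam's get_overlap() supplies the per-amplicon overlap, the best and second-best candidates are taken from the overlap array, and ambiguous or poorly covered reads are dropped rather than guessed.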