From 8e5bf30bf9c4f24b31fea42a602b7f0efbaceae6 Mon Sep 17 00:00:00 2001
From: Ammar Aziz
Date: Fri, 8 Mar 2024 17:10:49 +1100
Subject: [PATCH] upstream: Align trim (#2)

* adding --min-depth flag to minion and parameterising the mask and stats calls with it
* parameterising the vcf filter with the same global min-depth variable for consistency
* adding github actions CI
* adding github actions CI
* adding github actions CI
* adding github actions CI
* moving unit tests over to github actions
* parameterising the vcf filter with the same global min-depth variable for consistency
* replacing travis with github action CI
* adding DOI to README
* updating artic-tools, adding more stats to artic report, running report regardless of strict, setting strict to filter VCF
* adding opt to enforce full length amplicon match during align trim, activate using --strict flag to minion
* tweaking report generator
* version bump
* docs update
* replacing longshot with the latest medaka
* trying to fix medaka install on osx
* adding bcftools norm command to remove vars from the pass VCF if they have been masked out in the preconsensus
* cleaner parsing of GQ field
* upgrading to latest medaka and dropping longshot from the conda env
* exposing opts for vcf_filter
* bumping medaka to latest patch
* unpinning pandas
* Change to overlap-based method of amplicon resolution
* Integrate new align_trim.py

---------

Co-authored-by: Will Rowe
Co-authored-by: Chris Wright
---
 artic/align_trim.py | 352 ++++++++++++++++++++++++--------------------
 artic/minion.py     |  18 ++-
 2 files changed, 212 insertions(+), 158 deletions(-)

diff --git a/artic/align_trim.py b/artic/align_trim.py
index 4ade82b3..96dabc67 100755
--- a/artic/align_trim.py
+++ b/artic/align_trim.py
@@ -1,51 +1,31 @@
 #!/usr/bin/env python
-# Written by Nick Loman
-
 from copy import copy
-from collections import defaultdict
-import pysam
+from collections import defaultdict, namedtuple
 import sys
-from .vcftagprimersites import read_bed_file
+import numpy as np
+import pysam
+
+from artic.vcftagprimersites import read_bed_file
+
+PassRead = namedtuple(
+    'PassRead',
+    ('qname', 'amplicon', 'coverage', 'left', 'right'))
 
 # consumesReference lookup for if a CIGAR operation consumes the reference sequence
 consumesReference = [True, False, True, True, False, False, False, True]
-
 # consumesQuery lookup for if a CIGAR operation consumes the query sequence
 consumesQuery = [True, True, False, False, True, False, False, True]
 
 
-def find_primer(bed, pos, direction):
-    """Given a reference position and a direction of travel, walk out and find the nearest primer site.
-
-    Parameters
-    ----------
-    bed : list
-        A list of dictionaries, where each dictionary contains a row of bedfile data
-    pos : int
-        The position in the reference sequence to start from
-    direction : string
-        The direction to search along the reference sequence
-
-    Returns
-    -------
-    tuple
-        The offset, distance and bed entry for the closest primer to the query position
-    """
-    from operator import itemgetter
-
-    if direction == '+':
-        closest = min([(abs(p['start'] - pos), p['start'] - pos, p)
-                       for p in bed if p['direction'] == direction], key=itemgetter(0))
-    else:
-        closest = min([(abs(p['end'] - pos), p['end'] - pos, p)
-                       for p in bed if p['direction'] == direction], key=itemgetter(0))
-    return closest
+def overlap_size(o1, o2):
+    s1, e1 = sorted(o1)
+    s2, e2 = sorted(o2)
+    return min(e1, e2) - max(s1, s2)
 
 
 def trim(segment, primer_pos, end, debug):
     """Soft mask an alignment to fit within primer start/end sites.
-
     Parameters
     ----------
     segment : pysam.AlignedSegment
@@ -146,138 +126,190 @@ def trim(segment, primer_pos, end, debug):
     return
 
 
-def go(args):
-    """Filter and soft mask an alignment file so that the alignment boundaries match the primer start and end sites.
-    Based on the most likely primer position, based on the alignment coordinates.
-    """
-    # prepare the report outfile
+def overlap_trim(args):
+    """Detect which amplicon each read derives from and trim primers accordingly."""
+
     if args.report:
         reportfh = open(args.report, "w")
-        print("QueryName\tReferenceStart\tReferenceEnd\tPrimerPair\tPrimer1\tPrimer1Start\tPrimer2\tPrimer2Start\tIsSecondary\tIsSupplementary\tStart\tEnd\tCorrectlyPaired", file=reportfh)
-
-    # set up a counter to track amplicon abundance
-    counter = defaultdict(int)
+        print(
+            "QueryName\tReferenceStart\tReferenceEnd\t"
+            "PrimerPair\t"
+            "Primer1\tPrimer1Start\t"
+            "Primer2\tPrimer2Start\t"
+            "IsSecondary\tIsSupplementary\t"
+            "Start\tEnd\tCorrectlyPaired", file=reportfh)
 
     # open the primer scheme and get the pools
    bed = read_bed_file(args.bedfile)
-    pools = set([row['PoolName'] for row in bed])
+    pools = set(row['PoolName'] for row in bed)
     pools.add('unmatched')
-
-    # open the input SAM file and process read groups
-    infile = pysam.AlignmentFile("-", "rb")
-    bam_header = infile.header.copy().to_dict()
-    if not args.no_read_groups:
-        bam_header['RG'] = []
-        for pool in pools:
-            read_group = {}
-            read_group['ID'] = pool
-            bam_header['RG'].append(read_group)
-
-    # prepare the alignment outfile
-    outfile = pysam.AlignmentFile("-", "wh", header=bam_header)
+    primer_pairs = defaultdict(dict)
+    for b in bed:
+        scheme, pair, side = b['Primer_ID'].split('_')
+        primer_pairs[pair][side] = b
+    # this data structure is more useful for searching...
+    amplicons = np.fromiter((
+        (k, v['LEFT']['PoolName'],
+         v['LEFT']['end'], v['RIGHT']['start'],  # just insert
+         v['LEFT']['start'], v['RIGHT']['end'],  # contains primers
+         v['LEFT']['Primer_ID'], v['RIGHT']['Primer_ID'])
+        for k, v in primer_pairs.items()),
+        dtype=[
+            ('name', int), ('pool', int),
+            ('insert_start', int), ('insert_end', int),
+            ('start', int), ('end', int),
+            ('left_primer', 'U20'), ('right_primer', 'U20')])
 
     # iterate over the alignment segments in the input SAM file
-    for segment in infile:
-
-        # filter out unmapped and supplementary alignment segments
-        if segment.is_unmapped:
-            print("%s skipped as unmapped" %
-                  (segment.query_name), file=sys.stderr)
-            continue
-        if segment.is_supplementary:
-            print("%s skipped as supplementary" %
-                  (segment.query_name), file=sys.stderr)
-            continue
-
-        # locate the nearest primers to this alignment segment
-        p1 = find_primer(bed, segment.reference_start, '+')
-        p2 = find_primer(bed, segment.reference_end, '-')
-
-        # check if primers are correctly paired and then assign read group
-        # NOTE: removed this as a function as only called once
-        # TODO: will try improving this / moving it to the primer scheme processing code
-        correctly_paired = p1[2]['Primer_ID'].replace(
-            '_LEFT', '') == p2[2]['Primer_ID'].replace('_RIGHT', '')
+    passing_reads = defaultdict(list)  # by (amplicon, is_reverse)
+    with pysam.AlignmentFile(args.bamfile, "rb") as bam:
+        bam_header = bam.header.copy().to_dict()
         if not args.no_read_groups:
-            if correctly_paired:
-                segment.set_tag('RG', p1[2]['PoolName'])
-            else:
-                segment.set_tag('RG', 'unmatched')
-        if args.remove_incorrect_pairs and not correctly_paired:
-            print("%s skipped as not correctly paired" %
-                  (segment.query_name), file=sys.stderr)
-            continue
-
-        # ignore the read if it does not cover the entire amplicon
-        if args.enforce_amplicon_span and (segment.query_alignment_length < abs(p2[2]['start'] - p1[2]['end'])):
-            print("%s skipped as not full length amplicon match" % (segment.query_name), file=sys.stderr)
-            continue
-
-        # update the report with this alignment segment + primer details
-        report = "%s\t%s\t%s\t%s_%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d" % (segment.query_name, segment.reference_start, segment.reference_end, p1[2]['Primer_ID'], p2[2]['Primer_ID'], p1[2]['Primer_ID'], abs(
-            p1[1]), p2[2]['Primer_ID'], abs(p2[1]), segment.is_secondary, segment.is_supplementary, p1[2]['start'], p2[2]['end'], correctly_paired)
-        if args.report:
-            print(report, file=reportfh)
-        if args.verbose:
-            print(report, file=sys.stderr)
-
-        # get the primer positions
-        if args.start:
-            p1_position = p1[2]['start']
-            p2_position = p2[2]['end']
-        else:
-            p1_position = p1[2]['end']
-            p2_position = p2[2]['start']
+            bam_header['RG'] = []
+            for pool in pools:
+                read_group = {}
+                read_group['ID'] = pool
+                bam_header['RG'].append(read_group)
+
+        for segment in bam:
+            # filter out unmapped and supplementary alignment segments
+            if segment.is_unmapped:
+                print("%s skipped as unmapped" %
+                      (segment.query_name), file=sys.stderr)
+                continue
+            if segment.is_supplementary:
+                print("%s skipped as supplementary" %
+                      (segment.query_name), file=sys.stderr)
+                continue
 
-        # softmask the alignment if left primer start/end inside alignment
-        if segment.reference_start < p1_position:
-            try:
-                trim(segment, p1_position, False, args.verbose)
-                if args.verbose:
-                    print("ref start %s >= primer_position %s" %
-                          (segment.reference_start, p1_position), file=sys.stderr)
-            except Exception as e:
-                print("problem soft masking left primer in {} (error: {}), skipping" .format(
-                    segment.query_name, e), file=sys.stderr)
+            # determine the amplicon by largest overlap (and find second best)
+            overlaps = np.zeros(len(amplicons), dtype=int)
+            for i, amp in enumerate(amplicons):
+                overlaps[i] = segment.get_overlap(amp['start'], amp['end'])
+            best, second = np.argpartition(overlaps, -2)[:-3:-1]
+            if overlaps[best] < args.min_overlap:
+                print("%s skipped as no good overlap" % segment.qname, file=sys.stderr)
                 continue
 
-        # softmask the alignment if right primer start/end inside alignment
-        if segment.reference_end > p2_position:
-            try:
-                trim(segment, p2_position, True, args.verbose)
-                if args.verbose:
-                    print("ref start %s >= primer_position %s" %
-                          (segment.reference_start, p2_position), file=sys.stderr)
-            except Exception as e:
-                print("problem soft masking right primer in {} (error: {}), skipping" .format(
-                    segment.query_name, e), file=sys.stderr)
+            # don't take reads if the second best overlap is a large proportion
+            # of the mutual overlap of the amplicons
+            best_amplicon = amplicons[best]
+            second_amplicon = amplicons[second]
+            mutual = overlap_size(
+                *((amp['start'], amp['end'])
+                  for amp in (best_amplicon, second_amplicon)))
+            if overlaps[second] > args.max_mutual_overlap * mutual:
+                print("%s skipped as large secondary overlap" % segment.qname, file=sys.stderr)
+                continue
+            overlap = overlaps[best]
+            amplicon = amplicons[best]
+
+            # check whether the alignment extends to the primer at each end
+            extends = (
+                segment.reference_start <= amplicon['insert_start'],
+                segment.reference_end >= amplicon['insert_end'])
+
+            # if both primers, we call that "correctly paired"
+            if args.enforce_amplicon_span and not all(extends):
+                print("%s skipped as does not span: %s" %
+                      (segment.query_name, extends), file=sys.stderr)
                 continue
+            passing_reads[amplicon['name'], segment.is_reverse].append(
+                PassRead(segment.qname, amplicon['name'], overlap, *extends))
+        # end first pass
+
+    # filter alignments
+    print("Reads before filtering: {}".format(sum(len(x) for x in passing_reads.values())), file=sys.stderr)
+    chosen_reads = list()
+    if args.normalise:
+        for (amp, is_reverse), reads in passing_reads.items():
+            reads = sorted(reads, key=lambda x: x.coverage, reverse=True)
+            chosen_reads.extend(reads[0:args.normalise])
+    else:
+        for reads in passing_reads.values():
+            chosen_reads.extend(reads)
+    print("Reads after filtering: {}".format(len(chosen_reads)), file=sys.stderr)
+    chosen_reads = {r.qname:r for r in chosen_reads}
+
+    with \
+            pysam.AlignmentFile(args.bamfile, "rb") as bam, \
+            pysam.AlignmentFile("-", "wh", header=bam_header) as outfile:
+        for segment in bam:
+            wanted = segment.qname in chosen_reads
+            if not wanted or segment.is_unmapped or segment.is_supplementary:
+                continue
+            chosen = chosen_reads[segment.qname]
+            amplicon = amplicons[amplicons['name'] == chosen.amplicon]
+            if not args.no_read_groups:
+                segment.set_tag('RG', str(amplicon['pool'][0]))
 
-        # normalise if requested
-        if args.normalise:
-            pair = "%s-%s-%d" % (p1[2]['Primer_ID'],
-                                 p2[2]['Primer_ID'], segment.is_reverse)
-            counter[pair] += 1
-            if counter[pair] > args.normalise:
-                print("%s dropped as abundance theshold reached" %
+            if len(amplicon) > 1:
+                raise IndexError("Found more than one amplicon matching: {}".format(chosen))
+            else:
+                amplicon = amplicon[0]
+            if args.start:
+                trim_start, trim_end = amplicon['start'], amplicon['end']
+            else:
+                trim_start, trim_end = amplicon['insert_start'], amplicon['insert_end']
+
+            # softmask the alignment if left primer start/end inside alignment
+            if segment.reference_start < trim_start:
+                try:
+                    trim(segment, trim_start, False, args.verbose)
+                    if args.verbose:
+                        print("ref start %s < primer_position %s" %
+                            (segment.reference_start, trim_start),
+                            file=sys.stderr)
+                except Exception as e:
+                    print(
+                        "problem soft masking left primer in {} (error: {}), skipping".format(
+                            segment.query_name, e),
+                        file=sys.stderr)
+                    continue
+
+            # softmask the alignment if right primer start/end inside alignment
+            if segment.reference_end > trim_end:
+                try:
+                    trim(segment, trim_end, True, args.verbose)
+                    if args.verbose:
+                        print("ref end %s > primer_position %s" %
+                            (segment.reference_end, trim_end), file=sys.stderr)
+                except Exception as e:
+                    print(
+                        "problem soft masking right primer in {} (error: {}), skipping".format(
+                            segment.query_name, e),
+                        file=sys.stderr)
+                    continue
+
+            # check that the alignment still contains bases matching the reference
+            if 'M' not in segment.cigarstring:
+                print("%s dropped as does not match reference post masking" %
                       (segment.query_name), file=sys.stderr)
                 continue
-        # check the the alignment still contains bases matching the reference
-        if 'M' not in segment.cigarstring:
-            print("%s dropped as does not match reference post masking" %
-                  (segment.query_name), file=sys.stderr)
-            continue
-
-        # current alignment segment has passed filters, send it to the outfile
-        outfile.write(segment)
-
-    # close up the file handles
-    infile.close()
-    outfile.close()
-    if args.report:
-        reportfh.close()
+            # current alignment segment has passed filters, send it to the outfile
+            # update the report with this alignment segment + primer details
+            if args.report or args.verbose:
+                #"QueryName\tReferenceStart\tReferenceEnd\t"
+                #"PrimerPair\t"
+                #"Primer1\tPrimer1Start\t"
+                #"Primer2\tPrimer2Start\t"
+                #"IsSecondary\tIsSupplementary\t"
+                #"Start\tEnd\tCorrectlyPaired", file=reportfh)
+                report = '\t'.join(
+                    str(x) for x in (
+                        segment.query_name, segment.reference_start, segment.reference_end,
+                        '{}_{}'.format(amplicon['left_primer'], amplicon['right_primer']),
+                        amplicon['left_primer'], amplicon['start'],
+                        amplicon['right_primer'], amplicon['insert_end'],
+                        segment.is_secondary, segment.is_supplementary,
+                        amplicon['start'], amplicon['end'], int(all(extends))))
+                if args.report:
+                    print(report, file=reportfh)
+                if args.verbose:
+                    print(report, file=sys.stderr)
+            outfile.write(segment)
 
 
 def main():
@@ -285,21 +317,29 @@ def main():
     parser = argparse.ArgumentParser(
         description='Trim alignments from an amplicon scheme.')
 
-    parser.add_argument(
-        'bedfile', help='BED file containing the amplicon scheme')
+    parser.add_argument('bamfile',
+        help='alignment file.')
+    parser.add_argument('bedfile',
+        help='BED file containing the amplicon scheme.')
+    parser.add_argument('--min-overlap', type=int, default=0, dest='min_overlap',
+        help='Shortest allowed overlap to amplicon region')
+    parser.add_argument('--max-mutual-overlap', type=int, default=2, dest='max_mutual_overlap',
+        help='Maximum permissible overlap to second amplicon region as proportion of amplicon mutual overlap')
     parser.add_argument('--normalise', type=int,
                         help='Subsample to n coverage per strand')
-    parser.add_argument('--report', type=str, help='Output report to file')
+    parser.add_argument('--report', type=str,
+        help='Output report to file')
     parser.add_argument('--start', action='store_true',
                         help='Trim to start of primers instead of ends')
-    parser.add_argument('--no-read-groups', dest='no_read_groups',
-                        action='store_true', help='Do not divide reads into groups in SAM output')
-    parser.add_argument('--verbose', action='store_true', help='Debug mode')
-    parser.add_argument('--remove-incorrect-pairs', action='store_true')
-    parser.add_argument('--enforce-amplicon-span', dest='enforce_amplicon_span', action='store_true', help='Discard reads that don\'t cover the entire amplicon')
+    parser.add_argument('--no-read-groups', dest='no_read_groups', action='store_true',
+        help='Do not divide reads into groups in SAM output')
+    parser.add_argument('--verbose', action='store_true',
+        help='Debug mode')
+    parser.add_argument('--enforce-amplicon-span', action='store_true', dest='enforce_amplicon_span',
+        help='Discard reads that do not cover the entire amplicon')
     args = parser.parse_args()
 
-    go(args)
+    overlap_trim(args)
 
 
 if __name__ == "__main__":
diff --git a/artic/minion.py b/artic/minion.py
index 3a8d9858..98e8e09f 100755
--- a/artic/minion.py
+++ b/artic/minion.py
@@ -197,8 +197,22 @@ def run(parser, args):
         strict_string = '--enforce-amplicon-span'
     else:
         strict_string = ''
-    cmds.append("align_trim %s %s %s --start --remove-incorrect-pairs --report %s.alignreport.txt < %s.sorted.bam 2> %s.alignreport.er | samtools sort -T %s - -o %s.trimmed.rg.sorted.bam" % (strict_string, normalise_string, bed, args.sample, args.sample, args.sample, args.sample, args.sample))
-    cmds.append("align_trim %s %s %s --remove-incorrect-pairs --report %s.alignreport.txt < %s.sorted.bam 2> %s.alignreport.er | samtools sort -T %s - -o %s.primertrimmed.rg.sorted.bam" % (strict_string, normalise_string, bed, args.sample, args.sample, args.sample, args.sample, args.sample))
+    cmds.append(
+        "align_trim %s.sorted.bam %s "
+        "%s %s --start --report %s.alignreport.txt 2> %s.alignreport.er "
+        "| samtools sort -T %s - -o %s.trimmed.rg.sorted.bam" % (
+            args.sample, bed,
+            strict_string, normalise_string, args.sample, args.sample,
+            args.sample, args.sample)
+    )
+    cmds.append(
+        "align_trim %s.sorted.bam %s "
+        "%s %s --report %s.alignreport.txt 2> %s.alignreport.er "
+        "| samtools sort -T %s - -o %s.primertrimmed.rg.sorted.bam" % (
+            args.sample, bed,
+            strict_string, normalise_string, args.sample, args.sample,
+            args.sample, args.sample)
+    )
     cmds.append("samtools index %s.trimmed.rg.sorted.bam" % (args.sample))
     cmds.append("samtools index %s.primertrimmed.rg.sorted.bam" % (args.sample))
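
Reviewer note (not part of the patch): the sketch below is a minimal, self-contained illustration of the overlap-based amplicon assignment that the new overlap_trim() performs. The amplicon coordinates, the AMPLICONS table, and the helper name assign() are hypothetical; the two thresholds mirror the new --min-overlap and --max-mutual-overlap options.

# Illustrative sketch only -- hypothetical coordinates, not a real primer scheme.
def overlap_size(o1, o2):
    # size of the intersection of two (start, end) reference intervals
    s1, e1 = sorted(o1)
    s2, e2 = sorted(o2)
    return min(e1, e2) - max(s1, s2)

# hypothetical amplicons: (name, start, end), coordinates include the primers
AMPLICONS = [(1, 30, 410), (2, 320, 720), (3, 640, 1030)]

def assign(read_start, read_end, min_overlap=0, max_mutual_overlap=2):
    # rank amplicons by how much reference overlap they share with the read
    ranked = sorted(
        AMPLICONS,
        key=lambda a: overlap_size((read_start, read_end), (a[1], a[2])),
        reverse=True)
    best, second = ranked[0], ranked[1]
    best_ov = overlap_size((read_start, read_end), (best[1], best[2]))
    second_ov = overlap_size((read_start, read_end), (second[1], second[2]))
    if best_ov < min_overlap:
        return None  # no good overlap with any amplicon
    # reject ambiguous reads: the runner-up overlap must stay small relative
    # to the mutual overlap of the two best amplicons
    mutual = overlap_size((best[1], best[2]), (second[1], second[2]))
    if second_ov > max_mutual_overlap * mutual:
        return None  # large secondary overlap
    return best[0]

print(assign(100, 380))  # read sits inside amplicon 1 -> prints 1
print(assign(330, 700))  # read sits inside amplicon 2 -> prints 2

The patch applies the same idea per aligned segment: pysam's get_overlap() supplies the per-amplicon overlap, the best and second-best candidates are taken from the overlap array, and ambiguous or poorly covered reads are dropped rather than guessed.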