Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix full.aln to have correct VARIANT count when running snippy-core #531

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ chr 100541 del CAAA CAA CAA:38 CAAA:1 CDS + E
plas 619 complex GATC AATA GATC:28 AATA:0
plas 3221 mnp GA CT CT:39 CT:0 CDS + ECO_p012 rep hypothetical protein

% snippy-core --prefix core mysnps1 mysnps2 mysnps3 mysnps4
% snippy-core --corefreq .95 --prefix core mysnps1 mysnps2 mysnps3 mysnps4
Loaded 4 SNP tables.
Found 2814 core SNPs from 96615 SNPs.

% ls core.*
core.aln core.tab core.tab core.txt core.vcf
core.aln core.f.95.aln core.full.aln core.tab core.tab core.txt core.vcf
```

# Installation
Expand Down Expand Up @@ -101,7 +101,7 @@ Extension | Description
.bam | The alignments in [BAM](http://en.wikipedia.org/wiki/SAMtools) format. Includes unmapped, multimapping reads. Excludes duplicates.
.bam.bai | Index for the .bam file
.log | A log file with the commands run and their outputs
.aligned.fa | A version of the reference but with `-` at position with `depth=0` and `N` for `0 < depth < --mincov` (**does not have variants**)
.aligned.fa | A version of the reference but with `-` at position with `depth=0` and `N` for `0 < depth < --mincov` (**now have variants**)
.consensus.fa | A version of the reference genome with *all* variants instantiated
.consensus.subs.fa | A version of the reference genome with *only substitution* variants instantiated
.raw.vcf | The unfiltered variant calls from Freebayes
Expand Down
67 changes: 51 additions & 16 deletions bin/snippy-core
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ sub uniq {
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Options

my(@Options, $debug, $check, $noref, $aformat, $maxhap,
my(@Options, $debug, $check, $noref, $aformat, $maxhap, $corefreq,
$in_prefix, $out_prefix, $fasta, $bed,
$gap_char, $mask_char);
setOptions();
Expand Down Expand Up @@ -75,6 +75,8 @@ length($gap_char)==1 or err("--gap-char must be a single character");
#length($pad_char)==1 or err("--pad-char must be a single character");
length($mask_char)==1 or err("--mask-char must be a single character");

$corefreq<=1.0 and $corefreq>=0.0 or err("--corefreq must be a real value within [0,1]");

$in_prefix or err("Please provide --in-prefix");
$out_prefix or err("Please provide --out-prefix");

Expand Down Expand Up @@ -249,45 +251,56 @@ for my $chr (@$chrom_list) {
}
print $VCF tsv('#CHROM', qw(POS ID REF ALT QUAL FILTER INFO FORMAT), @id);

my @corepos = (); #array of core positions

for my $chr (@$chrom_list) {
msg("Processing contig: $chr");
POS:
for my $pos ( sort { $a <=> $b } keys %{$var{$chr}} )
{
next if $mask{$chr}{$pos}; # BAIL OUT IF THIS SITE IS MASKED
POS: for my $pos ( sort { $a <=> $b } keys %{$var{$chr}} ) {
next POS if $mask{$chr}{$pos}; # BAIL OUT IF THIS SITE IS MASKED

# check how many variants at this site and bail out if none
my @varid = keys %{$var{$chr}{$pos}};
next unless @varid;
next POS unless @varid;

my $r = substr($chrom->{$chr}, $pos-1, 1);
my @vcf = ( $chr, $pos, '.', $r );
my @tab = ( $chr, $pos, $r );
my %ALT;

# check ALL samples not just 'var' ones, as might be absent, so not core
for my $id (@id) {
my %ALT = ();
my $hits = 0;
my %subalts = ();
# check ALL samples not just 'var' ones, as might be absent, to include into core or not
ID: for my $id (@id) {
# check if this site has sample support ie. not -,N,n etc
my $ac = substr( $seq{$id}{$chr}, $pos-1, 1 );
$ac or err("Could not extract base $chr:".($pos-1)." for isolate $id");
next POS unless $ac =~ m/[AGTC]/i;
next ID unless $ac =~ m/[AGTC]/i;
$hits++;
# get ALT
my $alt = $var{$chr}{$pos}{$id};
next unless $alt; # no variant here
next ID unless $alt; # no variant here
#err("Bad ALT=$alt") unless $alt =~ m/[AGTC-]/;
#err("Long ALT=$alt") if length($alt) > 1;
# patch in ALT
substr($seq{$id}{$chr}, $pos-1, 1) = lc($alt); # could be '-'
$subalts{$id} = lc($alt);
# if insertion, we don't put in core
next POS unless $alt =~ m/[AGTC]/i;
next ID unless $alt =~ m/[AGTC-]/i;
$ALT{$id} = $alt;
}
next POS unless $hits >= ($#id + 1) * $corefreq;
# patch in ALT of those applicable
keys %subalts;
while(my ($k, $v) = each %subalts)
{
substr($seq{$k}{$chr}, $pos-1, 1) = $v;
}

push @corepos, $pos unless grep{$_ == $pos} @corepos;

my @GT = sort { $a cmp $b } uniq values %ALT;
my %GT_of = map { ($GT[$_-1] => $_) } (1 .. @GT);
$GT_of{$r} = 0;
# print Dumper($r, \%ALT, \@GT, \%GT_of);

push @vcf, join(",", @GT), '.', 'PASS', 'TYPE=snp', 'GT';
push @vcf, join(",", @GT), '.', 'PASS', join('', 'TYPE=', grep(/^[ACGT]$/, @GT)?'snp ':'', grep(/^-$/, @GT)?'del':''), 'GT';
push @vcf, map { $GT_of{$ALT{$_} || $r} || '0' } @id;
print $VCF tsv(@vcf);

Expand Down Expand Up @@ -320,6 +333,14 @@ for my $id ('Reference', @id) {
}
save_fasta_hash($full_aln, \%out);

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# PREFIX.core.aln
# Extract from full.aln with the core positions
if($corefreq < 1){
my $fcore_aln = "$out_prefix.f$corefreq.aln";
msg("Generating $fcore_aln");
save_pos_fasta_hash($fcore_aln, \%out, \@corepos);
}
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# PREFIX.txt file
my @stats = ([ qw(ID LENGTH ALIGNED UNALIGNED VARIANT HET MASKED LOWCOV) ]);
Expand Down Expand Up @@ -414,6 +435,19 @@ sub save_fasta_hash {
}
return $hash;
}
#----------------------------------------------------------------------
sub save_pos_fasta_hash {
my($fname, $hash , $cpos, $order) = @_;
msg("Saving FASTA: $fname") if $debug;
my $out = Bio::SeqIO->new(-file=>">$fname", -format=>"fasta", -alphabet=>'dna');
$order = [ sort keys %$hash ] unless $order;
for my $id (@$order) {
$out->write_seq(
Bio::Seq->new(-id=>$id, -seq=>join('', map { substr($hash->{$id},$_-1,1) } @$cpos), -alphabet=>'dna')
);
}
return $hash;
}

#----------------------------------------------------------------------
sub msg { print STDERR "@_\n";}
Expand All @@ -434,6 +468,7 @@ sub setOptions {
{OPT=>"ref=s", VAR=>\$fasta, DEFAULT=>'', DESC=>"Reference in FASTA or GENBANK"},
{OPT=>"prefix=s", VAR=>\$out_prefix, DEFAULT=>'core', DESC=>"Output prefix"},
{OPT=>"maxhap=i", VAR=>\$maxhap, DEFAULT=>100, DESC=>"Largest haplotype to decompose"},
{OPT=>"corefreq=f", VAR=>\$corefreq, DEFAULT=>1.0, DESC=>"Minimum aligned freq at a position to be included in core genome"},
{OPT=>"mask=s", VAR=>\$bed, DEFAULT=>'', DESC=>"BED file of sites to mask"},
{OPT=>"gap-char=s", VAR=>\$gap_char, DEFAULT=>'-', DESC=>"Gap/deletion character"},
{OPT=>"mask-char=s", VAR=>\$mask_char, DEFAULT=>'X', DESC=>"Masking character"},
Expand Down
2 changes: 1 addition & 1 deletion perl5/Snippy/Version.pm
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use base Exporter;
@EXPORT_OK = qw(version);
%EXPORT_TAGS = ( 'all' => [ @EXPORT_OK ] );

our $VERSION = "4.6.0";
our $VERSION = "4.7.0-alpha";

use strict;
use File::Basename;
Expand Down