-
Notifications
You must be signed in to change notification settings - Fork 1
/
bamPolish
executable file
·155 lines (107 loc) · 3.93 KB
/
bamPolish
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env perl
use Getopt::Long;
use Pod::Usage;
use strict;
## Author: Ariel Paulson ([email protected]).
## Filters, sorts, indexes, and idxstats an input bam (or sam)
## Catches corrupted bams and retries operations, up to 3 times per op.
##
## NOTE: minimum memory allocation is $cores GB; $mem has effect ONLY when > $cores GB.
my $samtools = 'samtools';
my ($inbam, $outbam, $sort, $aligned, $minqual, $primary, $concordant, $cores, $mem, $nosort);
GetOptions("i=s"=>\$inbam, "o=s"=>\$outbam, "q=i"=>\$minqual, "c=i"=>\$cores, "m=s"=>\$mem, "aligned"=>\$aligned, "concordant"=>\$concordant, "sort"=>\$sort, "primary"=>\$primary);
die "$0: input bam '$inbam' is empty or does not exist!\n" unless -s $inbam;
my $sam = $inbam =~ /\.sam$/;
my $filter = $minqual || $aligned || $primary || $concordant;
$cores = 1 unless $cores;
my $threadmem; # 'samtools sort' per-thread mem
if ($mem) {
my ($memN, $memL) = ($mem =~ /^(\d+)([A-Z])$/);
my $memL2 = $memL eq 'T' ? 'G' : $memL eq 'G' ? 'M' : $memL eq 'M' ? 'K' : $memL eq 'K' ? 'B' : '';
die "$0: memory string '$mem' not understood!\n" unless $memL2;
$threadmem = sprintf("%0.3f", $memN/$cores);
if ($threadmem < 1) {
$threadmem = sprintf("%0.0f", 1024*$threadmem) . $memL2;
} else {
$threadmem .= $memL;
}
$threadmem = '1G' if $mem < $cores;
}
my $tmp = "$outbam.tmp";
my $tempbam = "$tmp.bam";
my $tempsort = "$tmp.sort.bam";
my $corrupt = &validate_bam($inbam);
die "$0: input bam '$inbam' is corrupted!\n" if $corrupt;
if ($filter) {
## Must filter bam before sorting
my $cmd = "$samtools view -h";
$cmd .= " -q $minqual" if $minqual;
$cmd .= " -F 4" if $aligned;
$cmd .= " -F 256" if $primary;
$cmd .= " -f 2" if $concordant;
$cmd .= " $inbam | $samtools view -bS - > $tempbam";
&retry_until_valid($cmd, $tempbam, 3);
} else {
## No filtering
if ($sam) {
system "$samtools view -h -bS $inbam > $tempbam";
} else {
system "ln -sf $inbam $tempbam";
}
}
if ($sort) {
my $cmd = "$samtools sort -@ $cores";
$cmd .= " -m $threadmem" if $threadmem;
$cmd .= " -o $tempsort $tempbam";
&retry_until_valid($cmd, $tempbam, 3);
system "mv -f $tempsort $tempbam";
}
## Index and idxstats
chomp(my $tmp_true = `readlink -f $tempbam`);
chomp(my $out_true = `readlink -f $outbam`);
system "mv -f $tempbam $outbam" if $tmp_true ne $out_true; ## if $inbam==$outbam and no filter or sort, then tmp and out will be the same...
&execute("$samtools index $outbam", 1);
(my $outidx = $outbam) =~ s/\.bam$/.idxstats.txt/;
&execute("$samtools idxstats $outbam > $outidx", 1);
system "rm -f $tmp.*";
print "$0 $inbam complete!\n";
exit;
sub execute {
## Runs command and maybe prints to STDOUT/ERR
my ($cmd, $out) = @_;
my $FH = $out==1 ? *STDOUT : $out==2 ? *STDERR : undef;
print $FH "\n$cmd\n" if $FH;
system $cmd;
}
sub validate_bam {
## Returns non-null if the bam file is corrupted
my $BAM = shift; # /path/to/file.bam
my $MSG = "validate.bam.$$.msg";
my $ERR;
if (-s $BAM) {
system "samtools view $BAM 2> $MSG | head > /dev/null";
chomp($ERR = `cat $MSG`);
system "rm -f $MSG";
} else {
$ERR = "$BAM does not exist or is empty!";
}
return $ERR;
}
sub retry_until_valid {
## Retries an operation N times, or until output bam is OK.
my ($cmd, $testbam, $max_attempts) = @_;
my $attempts;
{
$attempts++;
&execute($cmd, 1);
my $corrupt = &validate_bam($testbam);
if ($corrupt) { # truncated bam or something
print "$corrupt\n*** Bam has errors, retrying operations...\n";
if ($attempts == $max_attempts) {
die "$0: failed $attempts times to produce error-free bam; not trying again!\n";
} else {
redo;
}
}
}
}