forked from sanahmed/PhaME
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_repeat_coords.pl
153 lines (139 loc) · 3.89 KB
/
get_repeat_coords.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/perl
use strict;
use Getopt::Long;
use FindBin;
my $identity=95;
my $len_cutoff=0;
my $output="repeats_coords.txt";
my $stats= "repeats_stats.txt";
GetOptions(
'i=i' => \$identity,
'l=i' => \$len_cutoff,
'o=s' => \$output,
's=s' => \$stats,
'help|?' => sub{Usage()},
);
sub Usage
{
print <<USAGE;
perl $0 [options] <fasta>
-i INT the identity cutoff 0 to 100 (default: 95)
-l INT the repeat length cutoff (default:0)
-o STRING output filename (default: repeats_coords.txt)
-s STRING output stats filename (default: repeats_stats.txt)
USAGE
exit;
}
my $file=$ARGV[0];
&Usage unless (-e $file);
my $bindir= getBinDirectory();
my $command="$bindir/MUMmer3.23/nucmer --maxmatch --nosimplify --prefix=seq_seq$$ $file $file";
print "Running self-nucmer on $file.\n";
if (system ("$command")) {die "$command"};
# apply identity cutoff and lenght cutoff and use awk to skip self-hits
my $command="$bindir/MUMmer3.23/show-coords -r -I $identity -L $len_cutoff -T seq_seq$$.delta | awk \'\$1 != \$3 || \$2 != \$4 && \$8==\$9 {print}\' > seq_seq$$.coords";
if (system ("$command")) {die "$command"};
&get_coords_file("seq_seq$$.coords");
unlink "seq_seq$$.delta";
unlink "seq_seq$$.coords";
sub get_coords_file
{
my $file=shift;
my %hash;
open (IN,$file);
my $comparison_files=<IN>;
my $tmp=<IN>;
my $field_head=<IN>;
my @comparison_files=split /\s+/,$comparison_files;
my $seq_len = &get_fasta_len($comparison_files[0]);
my %seq_len=%{$seq_len};
my $seq_num=0;
my $total_seq_len=0;
my $total_repeats_len=0;
my $repeats_seg_num=0;
my $repeat_id="r00000";
open (OUT, ">$output");
open (OUT1, ">$stats");
# this is nucmer coordinate output
#[S1] [E1] [S2] [E2] [LEN 1] [LEN 2] [% IDY] [TAGS]
#15376 16742 607219 608585 1367 1367 99.56 gi|49175990|ref|NC_000913.2| gi|49175990|ref|NC_000913.2|
# read through each repeat region and record its position of the seq in hash
while(<IN>){
chomp;
my @fields=split /\t/,$_;
my $seq_id=$fields[7];
my $start=$fields[0];
my $end=$fields[1];
for my $pos ($start..$end){
$hash{$seq_id}->{$pos}=1;
}
}
close IN;
foreach my $seq_id (sort keys %seq_len){
$seq_num++;
my @repeats;
my $seq_total_repeats_len=0;
my $seq_repeats_seg_num=0;
my $len = $seq_len{$seq_id};
for my $pos (1..$len){
if($hash{$seq_id}->{$pos}==1){
$seq_total_repeats_len++;
push @repeats, $pos;
}
else{
if (@repeats){
$seq_repeats_seg_num++;
$repeat_id++;
my $r_start= $repeats[0];
my $r_end= $repeats[-1];
print OUT $seq_id, "\t", $repeat_id, "\t",$r_start,"\t",$r_end, "\t",$r_end-$r_start+1,"\n";
undef @repeats;
}
}
}
if (@repeats){
$seq_repeats_seg_num++;
$repeat_id++;
my $r_start= $repeats[0];
my $r_end= $repeats[-1];
print OUT $seq_id, "\t", $repeat_id, "\t",$r_start,"\t",$r_end, "\t",$r_end-$r_start+1,"\n";
undef @repeats;
}
print OUT1 "\n$seq_id size:\t$len\n";
print OUT1 "Repeats segment #:\t$seq_repeats_seg_num\n";
printf OUT1 ("Repeats total length:\t%d (%.2f%%)\n", $seq_total_repeats_len, $seq_total_repeats_len/$len*100);
$total_repeats_len += $seq_total_repeats_len;
$total_seq_len += $len;
}
close OUT;
if ($seq_num>1){
print OUT1 "\nOVERALL \nTotal seq #:\t$seq_num\n";
print OUT1 " Total bases #:\t$total_seq_len\n";
printf OUT1 (" Repeats total length:\t%d (%.2f%%)\n",$total_repeats_len,$total_repeats_len/$total_seq_len*100);
}
print OUT1;
}
sub get_fasta_len
{
my $file=shift;
my %len;
my $id;
open (INFASTA, $file);
while(<INFASTA>){
chomp;
if (/^>(\S+)/){$id = $1;}
else{
$_=~ s/\n//g;
$_=~ s/ //g;
$len{$id} += length $_;
}
}
close INFASTA;
return \%len;
}
sub getBinDirectory
{
my @t = split '/', "$FindBin::RealBin";
my $path = join '/', @t;
return ($path);
}