forked from weizhongli/cdhit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_multi_seq.pl
executable file
·76 lines (64 loc) · 1.63 KB
/
make_multi_seq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/perl
#note you have to use "-d 0" in the cd-hit run
#note you better to use "-g 1" in the cd-hit run
# this script read the .clstr file, it generate a seperate fasta file
# for each cluster over certain size
# for example, if you run cd-hit -i db -o dbout -c 0.6 -n 4 -d 0 -g 1
# then you will have a dbout.clstr
# now run this script:
# ./make_multi_seq.pl db dbout.clstr multi-seq 20
#
my $fasta = shift;
my $clstr = shift;
my $out_dir = shift;
my $size_cutoff = shift;
die unless (-e $fasta);
die unless (-e $clstr);
die unless ($out_dir);
$size_cutoff = 1 unless ($size_cutoff);
if (not -e $out_dir) {my $cmd = `mkdir $out_dir`;}
open(TMP, $clstr) || die;
my %id2cid=();
my $cid = "";
my @ids = ();
my $no = 0;
while($ll = <TMP>) {
if ($ll =~ /^>/) {
if ($no >= $size_cutoff) {
foreach $i (@ids) { $id2cid{$i} = $cid;}
}
if ($ll =~ /^>Cluster (\d+)/) { $cid = $1; }
else { die "Wrong format $ll"; }
@ids = ();
$no = 0;
}
else {
if ($ll =~ /(aa|nt), >(.+)\.\.\./) { push(@ids, $2); $no++; }
else { die "Wrong format $ll"; }
}
}
close(TMP);
if ($no >= $size_cutoff) {
foreach $i (@ids) { $id2cid{$i} = $cid;}
}
open(FASTA, $fasta) || die;
my $outfile_open = 0;
my $flag = 0;
while($ll=<FASTA>) {
if ($ll=~ /^>(\S+)/) {
my $id = $1;
my $cid = $id2cid{$id};
if (defined($cid)) {
close(OUT) if ($outfile_open);
open(OUT, ">> $out_dir/$cid") || die "can not open file to write $out_dir/$cid";
$outfile_open = 1;
$flag = 1;
}
else {
$flag = 0;
}
}
if ($flag) {print OUT $ll;}
}
close(FASTA);
close(OUT) if ($outfile_open);