forked from sc-zhang/bioscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
eval_filled_gaps.py
executable file
·84 lines (71 loc) · 1.85 KB
/
eval_filled_gaps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python
import sys
def search_gaps(seq):
gaps_db = []
cnt_n = 0
for i in range(0, len(seq)):
if seq[i].lower() == 'n':
if cnt_n == 0:
s = i
cnt_n += 1
else:
if cnt_n != 0:
e = i
gaps_db.append([s, e-1])
cnt_n = 0
cnt_n = 0
for region in gaps_db:
cnt_n += region[1]-region[0]+1
return gaps_db, cnt_n
def calc_gaps(seq):
cnt_n = 0
for i in range(0, len(seq)):
if seq[i].lower() == 'n':
cnt_n += 1
return cnt_n
def make_seq_db(in_fasta):
seq_db = {}
with open(in_fasta, 'r') as f_in:
id = ''
seq = ''
for line in f_in:
if line[0] == ">":
if seq != '':
seq_db[id] = seq
id = line.strip()
seq = ''
else:
seq += line.strip()
seq_db[id] = seq
return seq_db
def eval_filled_gaps(ref_fasta, query_fasta, result_file):
print("Reading reference fasta")
ref_seq_db = make_seq_db(ref_fasta)
print("Reading query fasta")
query_seq_db = make_seq_db(query_fasta)
print("Evaluating")
with open(result_file, 'w') as f_out:
for id in ref_seq_db:
ref_gaps_db, ref_gaps_cnt = search_gaps(ref_seq_db[id])
query_gaps_cnt = calc_gaps(query_seq_db[id])
f_out.write(id[1:]+"\n")
if ref_gaps_cnt != 0:
f_out.write("Filled %0.2f%%\n"%((ref_gaps_cnt-query_gaps_cnt)*1.0/ref_gaps_cnt*100.0))
else:
f_out.write("No gaps\n")
if len(ref_gaps_db) != 0:
for region in ref_gaps_db:
s = region[0]
e = region[1]
f_out.write("Region %d-%d:\n%s\n"%(s, e, query_seq_db[id][s:e+1]))
f_out.write("\n")
print("Success")
if __name__ == "__main__":
if len(sys.argv) < 4:
print("Notice: this script is used to evaulate status that gaps been filled")
print("Usage: python "+sys.argv[0]+" <ref_fasta> <query_fasta> <result_file>")
else:
ref_fasta = sys.argv[1]
query_fasta = sys.argv[2]
result_file = sys.argv[3]
eval_filled_gaps(ref_fasta, query_fasta, result_file)