#!/usr/bin/python
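"""
Calculate read-level summary metrics around SNP positions.

For every real (ground-truth) SNP and every candidate SNP, fetch all reads in
a WINDOW_SIZE window centered on the SNP from the input BAM file and count how
many reads fall in the window, how many are usable (as judged by deepSNP's
is_usable_read), and how many of each overlap the SNP position itself.
Running averages are logged periodically and final totals are printed as
summary statistics. Parsed SNP sets are cached as .pickle files next to their
input files so repeated runs skip the parsing step.
"""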
import os
import argparse
import logging
import pickle
import time

import pysam

# get_candidate_snps, get_real_snps, is_usable_read and DEBUG are expected to
# come from the deepSNP module via this star import.
from deepSNP import *
from collections import OrderedDict

WINDOW_SIZE = 100

def overlaps(read, position):
    # pysam's reference_end points one past the last aligned base (half-open
    # interval), so a read covers `position` only when position < reference_end.
    return read.reference_start <= position < read.reference_end
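# Illustration only (not used by the pipeline): with pysam's half-open
# coordinates, a read aligned to reference positions [100, 150) covers bases
# 100..149, so overlaps(read, 100) and overlaps(read, 149) are True while
# overlaps(read, 150) is False.
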
def main():
    parser = argparse.ArgumentParser(
        description="Calculates read and coverage summary metrics in windows around "
                    "real (ground-truth) and candidate SNP positions from an input BAM file")
    parser.add_argument('-b', '--bam-file', dest='bam', required=True, help='Input BAM file')
    parser.add_argument('-r', '--reference-file', dest='ref', required=True, help='Reference fasta file for input BAM')
    parser.add_argument('-v', '--vcf-file', dest='vcf', required=True, help='Input VCF file containing potential SNPs')
    parser.add_argument('-t', '--truth-file', dest='truth', required=True, help='Output of wgsim containing ground truth SNPs')
    parser.add_argument('-d', '--db-file', dest='db_file', required=True, help='Output Caffe DB file name')
    parser.add_argument('-l', '--log', dest='log_level', required=False, default="INFO", help='Set log level (e.g. DEBUG, INFO, WARNING)')
    args = parser.parse_args()

    # set logging config
    log_level_num = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level_num, int):
        raise ValueError('Invalid log level: %s' % args.log_level)
    # basicConfig is a no-op if a handler has already been installed (e.g. by deepSNP)
    logging.basicConfig(level=log_level_num)
    log = logging.getLogger('deepSNP')
    log.setLevel(log_level_num)
    in_bam = args.bam
    in_vcf = args.vcf
    in_ref = args.ref
    in_truth = args.truth
    # NOTE: db_file is accepted on the command line but is not used by this script
    db_file = args.db_file

    bam_f = pysam.AlignmentFile(in_bam, "rb")
    ref_f = pysam.Fastafile(in_ref)

    real_snps = OrderedDict()
    real_snps_pickle = os.path.splitext(in_truth)[0] + ".pickle"
    if os.path.isfile(real_snps_pickle):
        log.info("Loading %s", real_snps_pickle)
        real_snps = pickle.load(open(real_snps_pickle, "rb"))
    else:
        # TODO: rename get_candidate_snps to something not tied to candidate SNPs, e.g. a generic VCF parser
        # if we have a VCF file, parse the VCF
        if os.path.splitext(in_truth)[1] == '.vcf':
            # get_candidate_snps parses VCF
            real_snps = get_candidate_snps(in_truth)
        # else assume a BED-like file
        else:
            real_snps = get_real_snps(in_truth)
        log.info("Creating file: %s", real_snps_pickle)
        pickle.dump(real_snps, open(real_snps_pickle, "wb"))

    candidate_snps = OrderedDict()
    candidate_snps_pickle = os.path.splitext(in_vcf)[0] + ".pickle"
    # get {(chr, pos): (ref, alt)} mapping
    if os.path.isfile(candidate_snps_pickle):
        log.info("Loading %s", candidate_snps_pickle)
        candidate_snps = pickle.load(open(candidate_snps_pickle, "rb"))
    else:
        candidate_snps = get_candidate_snps(in_vcf)
        log.info("Creating file: %s", candidate_snps_pickle)
        pickle.dump(candidate_snps, open(candidate_snps_pickle, "wb"))
    # total number of reads we see
    total_num_reads = 0
    # total number of usable reads
    total_num_usable_reads = 0
    # total number of reads overlapping SNPs
    total_num_overlapping_reads = 0
    # total number of usable reads overlapping SNPs
    total_num_usable_overlapping_reads = 0

    start_time = time.time()
    num_snps = 0
    for location in real_snps:
        num_snps += 1
        if num_snps % 100000 == 0 and num_snps > 0:
            cur_time = time.time()
            log.info("Num real SNPs processed: %s (%s percent)", num_snps, 100.0 * num_snps / len(real_snps))
            log.info("Elapsed time: %s", cur_time - start_time)
            log.info("Avg num reads per window: %s", float(total_num_reads)/float(num_snps))
            log.info("Avg num usable reads per window: %s", float(total_num_usable_reads)/float(num_snps))
            log.info("Avg num overlapping reads: %s", float(total_num_overlapping_reads)/float(num_snps))
            log.info("Avg num usable overlapping reads: %s", float(total_num_usable_overlapping_reads)/float(num_snps))
        # location is a (chromosome, position) tuple
        chromosome = location[0]
        pos = int(location[1])
        # our feature window is centered on the SNP position;
        # max() makes sure we don't get a negative index near the start of a contig
        window_start = max(pos - (WINDOW_SIZE / 2), 0)
        window_end = window_start + WINDOW_SIZE
        log.debug("location: %s", location)
        window_reads = bam_f.fetch(chromosome, window_start, window_end)
        if DEBUG:
            ref_bases = ref_f.fetch(chromosome, window_start, window_end)
            print ref_bases.upper()
        # number of reads in this window
        num_reads = 0
        # number of usable reads in this window
        num_usable_reads = 0
        # number of reads overlapping the SNP
        num_overlapping_reads = 0
        # number of usable overlapping reads
        num_usable_overlapping_reads = 0
        for read in window_reads:
            num_reads += 1
            if overlaps(read, pos):
                num_overlapping_reads += 1
            # count the read separately if it is usable
            if is_usable_read(read):
                #if read.has_tag("MD") and is_usable_read(read):
                num_usable_reads += 1
                if overlaps(read, pos):
                    num_usable_overlapping_reads += 1
        total_num_reads += num_reads
        total_num_usable_reads += num_usable_reads
        total_num_overlapping_reads += num_overlapping_reads
        total_num_usable_overlapping_reads += num_usable_overlapping_reads
print "##########################################################"
print "## FINAL SUMMARY STATISTICS - REAL SNPS"
print "##########################################################"
print "Total num reads:", total_num_reads
print "Total num usable reads:", total_num_usable_reads
print "Total num overlapping reads:", total_num_overlapping_reads
print "Total num usable overlapping reads:", total_num_usable_overlapping_reads
print "Avg num reads per window:", float(total_num_reads)/float(num_snps)
print "Avg num usable reads per window:", float(total_num_usable_reads)/float(num_snps)
print "Avg num overlapping reads:", float(total_num_overlapping_reads)/float(num_snps)
print "Avg num usable overlapping reads:", float(total_num_usable_overlapping_reads)/float(num_snps)
print "##########################################################"
    # total number of reads we see
    total_num_reads = 0
    # total number of usable reads
    total_num_usable_reads = 0
    # total number of reads overlapping SNPs
    total_num_overlapping_reads = 0
    # total number of usable reads overlapping SNPs
    total_num_usable_overlapping_reads = 0

    start_time = time.time()
    num_snps = 0
    for location in candidate_snps:
        num_snps += 1
        if num_snps % 100000 == 0 and num_snps > 0:
            cur_time = time.time()
            log.info("Num candidate SNPs processed: %s (%s percent)", num_snps, 100.0 * num_snps / len(candidate_snps))
            log.info("Elapsed time: %s", cur_time - start_time)
            log.info("Avg num reads per window: %s", float(total_num_reads)/float(num_snps))
            log.info("Avg num usable reads per window: %s", float(total_num_usable_reads)/float(num_snps))
            log.info("Avg num overlapping reads: %s", float(total_num_overlapping_reads)/float(num_snps))
            log.info("Avg num usable overlapping reads: %s", float(total_num_usable_overlapping_reads)/float(num_snps))
        # location is a (chromosome, position) tuple
        chromosome = location[0]
        pos = int(location[1])
        # our feature window is centered on the SNP position;
        # max() makes sure we don't get a negative index near the start of a contig
        window_start = max(pos - (WINDOW_SIZE / 2), 0)
        window_end = window_start + WINDOW_SIZE
        log.debug("location: %s", location)
        window_reads = bam_f.fetch(chromosome, window_start, window_end)
        if DEBUG:
            ref_bases = ref_f.fetch(chromosome, window_start, window_end)
            print ref_bases.upper()
        # number of reads in this window
        num_reads = 0
        # number of usable reads in this window
        num_usable_reads = 0
        # number of reads overlapping the SNP
        num_overlapping_reads = 0
        # number of usable overlapping reads
        num_usable_overlapping_reads = 0
        for read in window_reads:
            num_reads += 1
            if overlaps(read, pos):
                num_overlapping_reads += 1
            # count the read separately if it is usable
            if is_usable_read(read):
                #if read.has_tag("MD") and is_usable_read(read):
                num_usable_reads += 1
                if overlaps(read, pos):
                    num_usable_overlapping_reads += 1
        total_num_reads += num_reads
        total_num_usable_reads += num_usable_reads
        total_num_overlapping_reads += num_overlapping_reads
        total_num_usable_overlapping_reads += num_usable_overlapping_reads
print "##########################################################"
print "## FINAL SUMMARY STATISTICS - CANDIDATE SNPS"
print "##########################################################"
print "Total num reads:", total_num_reads
print "Total num usable reads:", total_num_usable_reads
print "Total num overlapping reads:", total_num_overlapping_reads
print "Total num usable overlapping reads:", total_num_usable_overlapping_reads
print "Avg num reads per window:", float(total_num_reads)/float(num_snps)
print "Avg num usable reads per window:", float(total_num_usable_reads)/float(num_snps)
print "Avg num overlapping reads:", float(total_num_overlapping_reads)/float(num_snps)
print "Avg num usable overlapping reads:", float(total_num_usable_overlapping_reads)/float(num_snps)
print "##########################################################"
if __name__ == "__main__":
main()
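
# Example invocation (file names below are placeholders; substitute your own data):
#
#   python calculate_snp_metrics.py \
#       --bam-file sample.bam \
#       --reference-file reference.fa \
#       --vcf-file candidate_snps.vcf \
#       --truth-file wgsim_truth.vcf \
#       --db-file unused.db \
#       --log INFO
#
# Note that --db-file is required by the argument parser but is not currently
# used by this script (see the note next to db_file above).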