-
Notifications
You must be signed in to change notification settings - Fork 1
/
count_reads.py
executable file
·59 lines (48 loc) · 2 KB
/
count_reads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python
"""
count_reads.py - Counts total number of supporting reads from Tae-min's
SPUTNIK pipeline.
"""
import sys
from collections import defaultdict
if len(sys.argv) != 2:
print("Usage: %s input_file" % sys.argv[0])
exit(1)
# support is a dictionary of (unit length, number of supporting reads) pairs
# min_reads_for_inclusion: minimum number of supporting reads required to
# consider a unit length in the zygosity determination.
# the simple model is: if there are 0 unit lengths with enough supporting
# reads then no call is made; else if there is only 1 unit length passing
# the inclusion criterion, then call homozygous; else heterozygous.
def call_zygosity(supp_freq, min_reads_for_inclusion=1):
alleles = [ k for k, v in supp_freq.items() if v >= min_reads_for_inclusion ]
if len(alleles) == 0:
return "nocall"
elif len(alleles) == 1:
return "hom"
else:
return "het"
# line format is tab-delimited: index, chrom, start, end, read count list
# read count list is a string of numbers separated by commas. The numbers
# are the repeat unit lengths; each number represents a single read. So a
# string like 7,7,7,8 means 3 reads support an allele with 7 repeat units
# and a single read supports an allele with 8 repeat units.
# We drop the 'index' field.
tot_count = 0
f = open(sys.argv[1], 'r')
out_headers = [ 'chr', 'start', 'end', 'zygosity', 'support.freq' ]
for line in f:
fields = map(str.strip, line.split('\t'))
# header row
if fields[0] == 'index':
#print('\t'.join(out_headers))
continue
if not fields[4]:
# if the read count list is empty, there are no reads supporting
# any repeat length allele. skip these.
continue
# filter(None, seq) drops false values in seq. Empty strings in
# python are "falsey": http://docs.python.org/2/library/stdtypes.html#truth-value-testing
supports = filter(None, map(str.strip, fields[4].split(',')))
tot_count += len(supports)
print(tot_count)