-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_gcbias.py
113 lines (96 loc) · 3.93 KB
/
gen_gcbias.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from Bio import SeqIO
import csv
import sys
# loop through all the partitions
# what is the number of A T G C before and after error correction
# for each partition loop through all the reads
# for each read what is the number of A T G C before and after error correction
# 1. get_records, input: partition number
# an array of seqio objects each of which is a read
# 2. make_dictionary_atgc_numbers, input: array of seqio objects
# loops through the reads one by one passing them into dicationary make
# create a dicationary, key is read name, value is a dictionary containing number of A T G and Cs
# an array of seqio objects each of which is a read
def get_records(partition, extension):
handle = open("/home/pa354/c_elegans_data/0001/p"+partition+extension, "rU")
records = list(SeqIO.parse(handle, "fasta"))
handle.close()
return records
def make_reads_dict(records):
dict = {}
for record in records:
record_name = record.id.replace("_consensus", "")
dict[record_name] = record.seq
return dict
def process_reads(partition, before_corr_reads, after_corr_reads):
csv_obj = CsvObj("./gcbias_illumina/"+str(partition)+".csv")
# loop through lines of partition csv
blast_f = open("./make_alignment_csv_output/"+str(partition)+".csv", 'r')
blast_f.readline()
# loop through each read
for line in blast_f:
# grab the indexes from the line
line = line.split(",")
if len(line) < 4:
continue
# get the correct read from the before/after_corr_reads dictionary using line[1] (read header)
# splice the reads according to blast output
# if reads aligns reverse line[8] is larger than line[9] so switch them or this messes up indexing
if int(line[8]) > int(line[9]):
start = line[9]
line[9] = line[8]
line[8] = start
kept_gc = get_read_gc_percentage(before_corr_reads[line[1]][(int(line[8])-1):(int(line[9])-1)])
chucked_read = before_corr_reads[line[1]][0:(int(line[8])-1)] + before_corr_reads[line[1]][(int(line[9])-1):-1]
chucked_gc = get_read_gc_percentage(chucked_read)
output_dict = {'partition':partition, 'read_name':line[1],'kept_gc':kept_gc,'chucked_gc':chucked_gc}
csv_obj.add_line_to_csv(output_dict)
csv_obj.close_csv_handle()
def get_read_gc_percentage(read):
if len(read) == 0:
return None
bases = {'GC': 0, 'others': 0}
for char in read:
if char == "G" or char == "C":
bases['GC'] = bases['GC'] + 1
else:
bases['others'] = bases['others'] + 1
gc_percentage = float(bases['GC'])*100 / (bases['GC']+bases['others'])
return gc_percentage
class CsvObj:
header = []
csvwriter = ''
csvfile = ''
def __init__(self, filename):
self.csvfile = open(filename, 'w+')
self.csvwriter = csv.writer(self.csvfile, delimiter=',')
def add_line_to_csv(self, dictionary):
# if we don't have a header array create one using the keys of the dictionary
if len(self.header) == 0:
self.create_header_array(dictionary)
csv_array = []
for key in self.header:
csv_array.append(dictionary[key])
self.write_line_to_csv(csv_array)
# grab the keys of the dictionary, these will be the header of the csv
def create_header_array(self, dictionary):
header = []
for key in dictionary:
header.append(key)
self.write_line_to_csv(header)
self.header = header
# write the input line to the csv file
def write_line_to_csv(self, csv_array):
self.csvwriter.writerow(csv_array)
def close_csv_handle(self):
self.csvfile.close()
print(sys.argv)
for partition_number in range(int(sys.argv[1]),int(sys.argv[2])):
if partition_number % 20 == 0:
print(partition_number)
partition = '{0:04}'.format(partition_number)
before_corr_records = get_records(partition, "")
after_corr_records = get_records(partition, ".blast6.r.fa")
before_corr_reads = make_reads_dict(before_corr_records)
after_corr_reads = make_reads_dict(after_corr_records)
process_reads(partition, before_corr_reads, after_corr_reads)