-
Notifications
You must be signed in to change notification settings - Fork 0
/
surrounding_5mers.py
99 lines (83 loc) · 3.5 KB
/
surrounding_5mers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from Bio import SeqIO
import csv
import sys
# loop through all the partitions
# what is the number of A T G C before and after error correction
# for each partition loop through all the reads
# for each read what is the number of A T G C before and after error correction
# 1. get_records, input: partition number
# an array of seqio objects each of which is a read
# 2. make_dictionary_atgc_numbers, input: array of seqio objects
# loops through the reads one by one passing them into dicationary make
# create a dicationary, key is read name, value is a dictionary containing number of A T G and Cs
# an array of seqio objects each of which is a read
def get_records(partition, extension):
handle = open("../0001/p"+partition+extension, "rU")
records = list(SeqIO.parse(handle, "fasta"))
handle.close()
return records
def make_reads_dict(records):
dict = {}
for record in records:
record_name = record.id.replace("_consensus", "")
dict[record_name] = record.seq
return dict
def process_reads(partition, before_corr_reads, after_corr_reads):
print("Part "+partition)
csv_obj = CsvObj("./prev_post_csv_output/5mer_"+str(partition)+".csv")
# loop through lines of partition csv
blast_f = open("./make_alignment_csv_output/"+str(partition)+".csv", 'r')
blast_f.readline()
# loop through each read
for line in blast_f:
# grab the indexes from the line
line = line.split(",")
if len(line) < 4:
continue
# get the correct read from the before/after_corr_reads dictionary using line[1] (read header)
# splice the reads according to blast output
fivemer_prev = before_corr_reads[line[1]][(int(line[8])-6):(int(line[8])-1)]
fivemer_post = before_corr_reads[line[1]][int(line[9]):(int(line[9])+5)]
output_dict = {'partition':partition, 'read_name':line[1], 'fivemer_prev': fivemer_prev, 'fivemer_post' : fivemer_post}
csv_obj.add_line_to_csv(output_dict)
csv_obj.close_csv_handle()
def output_line_to_csv(read_name, dict):
csv_string = ','.join("%s=%r" % (key,val) for (key,val) in dict.iteritems())
csv_string = read_name + "," + csv_string
class CsvObj:
header = []
csvwriter = ''
csvfile = ''
def __init__(self, filename):
self.csvfile = open(filename, 'w+')
self.csvwriter = csv.writer(self.csvfile, delimiter=',')
def add_line_to_csv(self, dictionary):
# if we don't have a header array create one using the keys of the dictionary
if len(self.header) == 0:
self.create_header_array(dictionary)
csv_array = []
for key in self.header:
csv_array.append(dictionary[key])
self.write_line_to_csv(csv_array)
# grab the keys of the dictionary, these will be the header of the csv
def create_header_array(self, dictionary):
header = []
for key in dictionary:
header.append(key)
self.write_line_to_csv(header)
self.header = header
# write the input line to the csv file
def write_line_to_csv(self, csv_array):
self.csvwriter.writerow(csv_array)
def close_csv_handle(self):
self.csvfile.close()
print(sys.argv)
for partition_number in range(int(sys.argv[1]),int(sys.argv[2])):
if partition_number % 20 == 0:
print(partition_number)
partition = '{0:04}'.format(partition_number)
before_corr_records = get_records(partition, "")
after_corr_records = get_records(partition, ".blast6.r.fa")
before_corr_reads = make_reads_dict(before_corr_records)
after_corr_reads = make_reads_dict(after_corr_records)
process_reads(partition, before_corr_reads, after_corr_reads)