-
Notifications
You must be signed in to change notification settings - Fork 0
/
finalDNA.py
123 lines (110 loc) · 4.04 KB
/
finalDNA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import fileinput
import copy
#The Overarching pipeline for preprocessing, the algorithm, and post-processing
def solveDNA():
#get input, store length
shorts = [line.strip() for line in fileinput.input()]
#remove all sandwiched shorts
newShorts = filter(lambda x: [x for i in shorts if x in i and x != i] == [], shorts)
#store length
length = len(newShorts)
#generate initial pairings, in kn^2 time
pairs = generatePairings(newShorts)
#greedily merge in order, until all shorts are included in merge
solution = greedySolver(pairs, length)
#Get forrect part of result
result = solution[2]
print result
#The key solution engine of the algorithm,greedily merges pairs until all shorts
#have been accounted for
def greedySolver(pairs, length):
sortedPairs = pairs
#sort pairs by weight
sortedPairs.sort(cmp = lambda x,y: cmp(x[3], y[3]))
sortedPairs.reverse()
#begin with the highest weighted pair
result = copy.copy(sortedPairs[0])
#calculate the best forward and backward merges from that point, or dummy/placeholder edge if finished
Forwards = [x for x in sortedPairs if x[0] == result[1] and x[1] not in result[4]]
if Forwards:
bestForward = Forwards[0]
else:
bestForward = [None,None,None,-1,None]
Backs = [x for x in sortedPairs if x[1] == result[0] and x[0] not in result[4]]
if Backs:
bestBack = Backs[0]
else:
bestBack = [None,None,None,-1,None]
#continuously choose the highest weighted forward or backward merge
#stop only when every short has been included, or when no more
#shorts can be found
while len(result[4]) < length:
print result
#remove from sorted pairs any items sandwiched in result
#sortedPairs = [pair for pair in sortedPairs if pair[2] not in [pair2[2] for pair2 in sortedPairs if pair2 != pair]]
if bestForward[0] == None and bestBack[0] == None:
#print "AAAAAAAAAAAAAAAHHHHHHH"
break
if bestForward[3] > bestBack[3]:
result = mergePairs(result, bestForward)
Forwards = [x for x in sortedPairs if x[0] == result[1] and x[1] not in result[4]]
if Forwards:
bestForward = Forwards[0]
else:
bestForward = [None,None,None,-1,None]
else:
result = mergePairs(bestBack, result)
Backs = [x for x in sortedPairs if x[1] == result[0] and x[0] not in result[4]]
if Backs:
bestBack = Backs[0]
else:
bestBack = [None,None,None,-1,None]
#result is now a greedily chosen merge containing every initial short
#return it
return result
#Merges two pairs, storing the first first item and the last last item, as well as a union of all contained shorts
def mergePairs(prefix, suffix):
print "merging pairs:"
print "pair 1 is - " + prefix[2]
print "pair 2 is - "+ suffix[2]
a = shortestPairing(prefix[2], suffix[2])
if a:
print "merged into" + a[0]
if a[1] >= len(prefix[2]):
return [suffix[0], suffix[1], a[0], a[1], prefix[4] | suffix[4]]
elif a[1] >= len(suffix[2]):
return [prefix[0], prefix[1], a[0], a[1], prefix[4] | suffix[4]]
else:
return [prefix[0], suffix[1], a[0], a[1], prefix[4] | suffix[4]]
#preprocessing to generate all initial pairings
def generatePairings(shorts):
#returns list like (index of prefix, index of suffix, overlapped string, overlap weight)
resultsList = []
shortLen = len(shorts)
for i in range(shortLen):
#resultsList.append([None, i, shorts[i], 0])
#resultsList.append([i, None, shorts[i], 0])
prefix = shorts[i]
for j in range(shortLen):
if j != i:
suffix = shorts[j]
a = shortestPairing(prefix, suffix)
if a:
print [i, j, a[0], a[1], set([i, j])]
resultsList.append([i, j, a[0], a[1], set([i, j])])
# print resultsList
return resultsList
#find the best merge of two strings
def shortestPairing(string1,string2):
#if string1 ends in a prefix of string 2, returns the most overlapping string
#if no overlaps, returns None
#otherwise, returns (string with best overlap, number of overlapping characters)
result = None
l1 = len(string1)
l2= len(string2)
for i in range(min([l1,l2]) + 1):
if string1[l1 - i:] == string2[:i] and string2[:i]:
result = [string1 + string2[i:], i]
return result
#run code
solveDNA()