forked from sc-zhang/bioscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SimCollapse.py
executable file
·149 lines (125 loc) · 4.39 KB
/
SimCollapse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
import sys
import argparse
import random
def GetOpts():
group = argparse.ArgumentParser()
group.add_argument('-a', '--a_contigs', help='first fasta file contain contigs generated by SimContigs.py', required=True)
group.add_argument('-b', '--b_contigs', help='second fasta file contain contigs generated by SimContigs.py', required=True)
group.add_argument('-p', '--prefix', help='prefix of contig file a and contig file b, divided by comma, like: HA, HB', required=True)
group.add_argument('-o', '--output', help='filename of simulated data', required=True)
group.add_argument('-s', '--blast', help='blast file with format 6, must use first file of input as query and second file as database', required=True)
group.add_argument('-c', '--collapse', type=float, help='persentage of collapse region size, like 5 means 5%%, default: 10', default=10)
return group.parse_args()
def ReadFasta(inFasta):
fastaDB = {}
with open(inFasta, 'r') as fIn:
id = ''
seq = ''
totalLen = 0
for line in fIn:
if line[0] == '>':
if seq != '':
fastaDB[id] = seq
data = line.strip()[1:].split()
id = data[0]
seq = ''
else:
seq += line.strip()
totalLen += len(line.strip())
fastaDB[id] = seq
return fastaDB, totalLen
def ReadBlast(inBlast, prefixList):
blastDB = {}
with open(inBlast, 'r') as fBlast:
for line in fBlast:
data = line.strip().split()
queryID = data[0]
targetID = data[1]
identity = float(data[2])
queryRegion = list(map(int, [data[6], data[7]]))
targetRegion = list(map(int, [data[8], data[9]]))
if queryID not in blastDB:
blastDB[queryID] = [targetID, identity, queryRegion, targetRegion]
else:
if identity > blastDB[queryID][1]:
blastDB[queryID] = [targetID, identity, queryRegion, targetRegion]
mapping = {}
allContigList = []
for queryID in blastDB:
targetID = blastDB[queryID][0]
mapping[prefixList[0]+'-'+queryID] = prefixList[1]+"-"+targetID
mapping[prefixList[1]+"-"+targetID] = prefixList[0]+'-'+queryID
allContigList.append(prefixList[0]+'-'+queryID)
allContigList.append(prefixList[1]+"-"+targetID)
return mapping, allContigList
def SimCollapse(aFasta, bFasta, outFa, blastFile, prefixList, collapse):
print("Reading first contig file")
fastaDBA, lenA = ReadFasta(aFasta)
print("Reading second contig file")
fastaDBB, lenB = ReadFasta(bFasta)
lenDB = {}
for id in fastaDBA:
lenDB[prefixList[0]+"-"+id] = len(fastaDBA[id])
for id in fastaDBB:
lenDB[prefixList[1]+"-"+id] = len(fastaDBB[id])
print("Reading blast file")
mapping, allContigList = ReadBlast(blastFile, prefixList)
collapseLen = int((lenA+lenB)*collapse)
print("Total collapse size expected: %d"%(collapseLen))
removeList = {}
print("Removing collapse regions")
removeLen = 0
while collapseLen > 0:
index = random.randint(0, len(allContigList)-1)
name = allContigList[index]
while mapping[name] not in allContigList:
index = random.randint(0, len(allContigList)-1)
name = allContigList[index]
repeatCnt = 0
isLast = False
while mapping[name] not in allContigList or lenDB[name] > collapseLen:
if repeatCnt > 50:
isLast = True
break
if lenDB[name] > collapseLen:
repeatCnt += 1
index = random.randint(0, len(allContigList)-1)
name = allContigList[index]
if isLast:
break
pre, ctg = name.split('-')
collapseLen -= lenDB[name]
removeLen += lenDB[name]
if pre not in removeList:
removeList[pre] = []
removeList[pre].append(ctg)
allContigList.remove(name)
allContigList.remove(mapping[name])
print("Total collapse size removed: %d"%(removeLen))
print("Writing result")
with open(outFa, 'w') as fOut:
for pre in removeList:
if pre == prefixList[0]:
for id in fastaDBA:
if id not in removeList[pre]:
fOut.write(">%s_%s\n%s\n"%(pre, id, fastaDBA[id]))
else:
for id in fastaDBB:
if id not in removeList[pre]:
fOut.write(">%s_%s\n%s\n"%(pre, id, fastaDBB[id]))
print("Success")
if __name__ == "__main__":
opts = GetOpts()
aFasta = opts.a_contigs
bFasta = opts.b_contigs
outFa = opts.output
collapse = opts.collapse/100.0
blastFile = opts.blast
prefixList = opts.prefix.split(',')
print("Arguments")
print("\tInput files: %s, %s"%(aFasta, bFasta))
print("\tOutput file: %s"%(outFa))
print("\tBlast file: %s"%(blastFile))
print("\tCollapse ratio: %.2f%%"%(collapse*100))
SimCollapse(aFasta, bFasta, outFa, blastFile, prefixList, collapse)