-
Notifications
You must be signed in to change notification settings - Fork 1
/
embl_to_fasta.py
executable file
·67 lines (52 loc) · 1.72 KB
/
embl_to_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
import os
import sys
from pbcore.io import FastaIO
import commands
seq_list = []
base_dir = 'data/'
NCTCname = sys.argv[1]
assembly_name = sys.argv[2]
embl_fl = base_dir+NCTCname+'/'+assembly_name
out_file = base_dir+NCTCname+'/'+NCTCname+"_hgap.fasta"
writer = FastaIO.FastaWriter(out_file)
with open(embl_fl,'r') as f:
f.seek(0, os.SEEK_END)
num_lines = f.tell()
print num_lines
f.seek(0)
while 1:
if f.tell() >= num_lines:
break
line_raw = f.readline()
line = line_raw.split()
if line[0] == 'SQ':
#print "in ses"
seq = ''
next_line = f.readline()
#print f.tell(), num_lines
while next_line.strip() != '//':
seqlist = next_line.split()
seq_str = "".join(seqlist[:-1])
seq += seq_str
next_line = f.readline()
#print f.tell(), num_lines
seq_list.append(seq)
assert len(seq_list) == 1
grep_cmd = 'grep "fasta_record" '+ embl_fl
meta_data = commands.getstatusoutput(grep_cmd)[1].split('\n')
print meta_data, len(meta_data), commands.getstatusoutput(grep_cmd)
if len(meta_data) > 1:
meta_data1 = [x.split()[2] for x in meta_data]
start_end_tup = [(int(x.split('..')[0])-1,int(x.split('..')[-1])) for x in meta_data1]
new_seq_list = []
for tup in start_end_tup:
new_seq_list.append(seq_list[0][tup[0]:tup[1]])
else:
new_seq_list= seq_list
print len(new_seq_list)
for index,seq in enumerate(new_seq_list):
zmw = index + 1
ls = len(seq)
new_header = "m000_000/{zmw}/{start}_{end}".format(zmw=zmw, start=0, end=ls)
writer.writeRecord(new_header, seq)