-
Notifications
You must be signed in to change notification settings - Fork 2
/
mtgrasp_summarize.py
executable file
·127 lines (102 loc) · 4.96 KB
/
mtgrasp_summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
'''
This script can be used to summarize mtGrasp assembly outputs by providing a text file containing the relative or complete path(s) to assembly output folder(s).
'''
mtgrasp_version = 'v1.1.8'
import argparse
import os
import os.path
import pandas as pd
import sys
parser = argparse.ArgumentParser()
parser.add_argument('-i','--input',help='Input text file containing the path(s) to assembly output folder(s)',required=True)
parser.add_argument('-p','--prefix',help='Prefix for output files',required=True)
args = parser.parse_args()
input = args.input
prefix = args.prefix
tsv_filename = f'{prefix}_mtgrasp_{mtgrasp_version}_assembly_summary.tsv'
path_txt_filename = f'{prefix}_mtgrasp_{mtgrasp_version}_path_to_output.txt'
# Count the number of contigs, Ns per 1000bp, and number of bases, length of longest contig, standardization status, etc. in an assembly output fasta
def get_assembly_metrics(fasta):
seq_count = 0
num_bases = 0
total_n_count = 0 #Ns per 1000bp
with open(fasta, 'r') as f:
seq_len_list = []
for line in f:
line = line.strip()
if line.startswith('>'):
fasta_header = line
else:
seq = line.upper()
seq_len = len(seq)
seq_len_list.append(seq_len)
num_bases += seq_len
total_n_count += seq.count("N") # number of Ns per fasta sequence
seq_count += 1
if num_bases != 0:
n_count_per_1000 = total_n_count /num_bases/ 1000
else:
n_count_per_1000 = 0
max_seq_len = max(seq_len_list)
if seq_count == 0:
standardization_status = 'N/A'
circle_check = 'N/A'
elif seq_count == 1:
if "Circular" in fasta_header:
circle_check = 'Circular'
else:
circle_check = 'Linear'
if "StartSite" in fasta_header and "Strand" in fasta_header:
standardization_status = "StartSite_Strand_Standardized"
elif "StartSite" in fasta_header and "Strand" not in fasta_header:
standardization_status = "StartSite_Standardized"
elif "StartSite" not in fasta_header and "Strand" in fasta_header:
standardization_status = "Strand_Standardized"
else:
standardization_status = "Non-Standardized"
else:
# Circularization and standardization are only conducted for one-piece assemblies to avoid potentially introducing misassemblies
circle_check = 'Linear'
standardization_status = "Non-Standardized"
return n_count_per_1000, seq_count, num_bases, max_seq_len, circle_check, standardization_status
assembly_list, n_count_list, seq_count_list, num_bases_list, max_seq_list, circle_check_list, standardization_status_list, file_list = [], [], [], [], [], [], [], []
with open(input, 'r') as dirs:
for outdir in dirs:
outdir = outdir.strip()
if not os.path.exists(outdir):
print(f"Error: mtGrasp output directory {outdir} does not exist")
directory = "%s/final_output"%(outdir)
dir_exists=os.path.exists(directory)
if dir_exists:
for dir in os.listdir(directory):
fasta = os.path.abspath("%s/%s/%s.final-mtgrasp_%s-assembly.fa"%(directory,dir,dir, mtgrasp_version)) # Complete path to output fasta file
# check if fasta output file exists
if os.path.exists(fasta) and any(line.startswith('>') for line in open(fasta)):
n_count_list.append(get_assembly_metrics(fasta)[0])
seq_count_list.append(get_assembly_metrics(fasta)[1])
num_bases_list.append(get_assembly_metrics(fasta)[2])
max_seq_list.append(get_assembly_metrics(fasta)[3])
circle_check_list.append(get_assembly_metrics(fasta)[4])
standardization_status_list.append(get_assembly_metrics(fasta)[5])
else:
n_count_list.append(0)
seq_count_list.append(0)
num_bases_list.append(0)
max_seq_list.append(0)
circle_check_list.append('N/A')
standardization_status_list.append('N/A')
assembly_list.append(dir)
file_list.append(fasta)
df = pd.DataFrame({"Assembly": assembly_list,
"Ns per 1000bp": n_count_list,
"Number of Contigs": seq_count_list,
"Total Number of Base Pairs Per Assembly": num_bases_list,
"Length of the Longest Contig (bp)": max_seq_list,
"Circular or Linear": circle_check_list,
"Standardization Status": standardization_status_list
})
df.to_csv(tsv_filename, sep="\t", index=False)
with open(path_txt_filename, 'w') as f:
for path in file_list:
f.write(f"{path}\n")