diff --git a/README.md b/README.md
index 48dd0c7..9d4eae7 100755
--- a/README.md
+++ b/README.md
@@ -40,8 +40,25 @@ git clone https://github.com/bcgsc/mtGrasp.git
* ntCard
---
+### Installation Instructions for Dependencies
+#### Step 1:
-#### Special Installation Instructions for MITOS
+Recommended (Faster):
+```
+conda create -n mtgrasp python=3.10 mamba
+conda activate mtgrasp
+mamba install -c conda-forge -c bioconda snakemake 'blast>=2.10.0' biopython seqtk abyss ntjoin bwa samtools pilon ntcard
+```
+
+Alternative (Slower):
+```
+conda create -n mtgrasp python=3.10
+conda activate mtgrasp
+conda install -c conda-forge -c bioconda snakemake 'blast>=2.10.0' biopython seqtk abyss ntjoin bwa samtools pilon ntcard
+```
+
+
+#### Step 2: Special Installation Instructions for MITOS
As MITOS uses an older Python version, please install it in a new conda environment called "mitos" using the instructions below:
@@ -121,9 +138,9 @@ However, if such sequences are unavailable, you can move up the taxonomic hierar
`-b` or `--sealer_k=STRING`: k-mer size(s) used in sealer gap filling ['60,80,100,120']
-`-e` or `--end_recov_sealer_fpr=N`: False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]
+`-sf` or `--end_recov_sealer_fpr=N`: False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]
-`-v` or `--end_recov_sealer_k=STRING`: k-mer size used in sealer flanking end recovery ['60,80,100,120']
+`-sk` or `--end_recov_sealer_k=STRING`: k-mer size used in sealer flanking end recovery ['60,80,100,120']
`-i` or `--end_recov_p=N`: Merge at most N alternate paths during sealer flanking end recovery; use 'nolimit' for no limit [5]
@@ -131,6 +148,7 @@ However, if such sequences are unavailable, you can move up the taxonomic hierar
`-ma` or `--mismatch_allowed=N`: Maximum number of mismatches allowed while determining the overlapping region between the two ends of the mitochondrial assembly [1]
+`-v` or `--version`: Print out the version of mtGrasp and exit
---
@@ -193,28 +211,6 @@ If you are not interested in the standardized mitogenome sequence(s) or MITOS an
-### Summarize mtGrasp results
-
-```
-mtgrasp_summarize.py -i -t -l
-```
-Here, this script will summarize the mtGrasp results for all assembly output folders listed in the input text file ` `. The output tsv file `` will contain the following columns:
-
-`Assembly`: the name of the assembly output folder along with the k and kc values used for the assembly
-
-`Number of sequences`: the number of mitochondrial sequences generated by mtGrasp
-
-`Scaffold lengths`: the length(s) of the mitochondrial sequence(s) generated by mtGrasp
-
-`Standardization status`: whether the mitochondrial sequence(s) generated by mtGrasp is standardized or not
-
-`Number of gaps (pre-GapFilling)`: the number of gaps in the mitochondrial sequence(s) generated by mtGrasp before gap filling
-
-`Number of gaps (post-GapFilling)`: the number of gaps in the mitochondrial sequence(s) generated by mtGrasp after gap filling
-
-
-Finally, `` will contain the full path(s) to the assembly output fasta file(s)
-
---
### Standardize your own mitochondrial sequence(s) using mtGrasp
If you have your own mitochondrial sequence(s) and would like to standardize it/them using mtGrasp, you can do so by using mtGrasp's `mtgrasp_standardize.py` script.
@@ -263,5 +259,30 @@ Because mtGrasp annotation uses a third-party tool called [MITOS](https://www.sc
+---
+
+### Summarize mtGrasp results
+
+```
+mtgrasp_summarize.py -i -p
+```
+
+Here, this script will summarize the mtGrasp results for all assembly output folders listed in the input text file ` `. The output tsv file `{prefix}_mtgrasp_{mtgrasp_version}_assembly_summary.tsv'` will contain the following columns:
+
+`Assembly`: the name of the assembly output folder along with the k and kc values (number of read pairs if `-sub` is enabled) used for the assembly
+
+`Ns per 1000bp`: the number of Ns per 1000bp in the mitochondrial sequence(s) generated by mtGrasp
+
+`Number of Contigs`: the number of mitochondrial sequences generated by mtGrasp
+
+`Total Number of Base Pairs Per Assembly`: the total number of base pairs in the mitochondrial sequence(s) generated by mtGrasp
+
+`Length of the Lonest Contig (bp)`: the length of the longest mitochondrial sequence generated by mtGrasp
+
+`Circular or Linear`: whether the mitochondrial sequence(s) generated by mtGrasp is circular or linear
+
+`Standardization Status`: whether the mitochondrial sequence(s) generated by mtGrasp is start-site standardized and strand standardized
+
+Finally, `{prefix}_mtgrasp_{mtgrasp_version}_path_to_output.txt` will contain the complete path(s) to the assembly output fasta file(s)
diff --git a/create_references_for_ntjoin.py b/create_references_for_ntjoin.py
new file mode 100755
index 0000000..9da0754
--- /dev/null
+++ b/create_references_for_ntjoin.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+# Split a fasta file into multiple files, one per sequence and create a reference config file for ntJoin
+# Usage: split_fasta.py
+
+import sys
+
+ntjoin_reference_weight = 2 #Refer to ntJoin README for more info: https://github.com/bcgsc/ntJoin
+input = sys.argv[1]
+output = sys.argv[2]
+config = sys.argv[3]
+ref_list = []
+with open(input, 'r') as f:
+ for line in f:
+ line = line.strip()
+ if line.startswith('>'):
+ header = line
+ filename = header.split(' ')[0].strip('>') + '.fasta'
+ else:
+ sequence = line
+ with open(output + '/' + filename, 'w') as g:
+ g.write(header + '\n')
+ g.write(sequence + '\n')
+ ref_list.append(f'{output}/{filename},{ntjoin_reference_weight}')
+
+with open(config, 'w') as f:
+ f.write('\n'.join(ref_list))
+
+
diff --git a/mtgrasp.py b/mtgrasp.py
index 2a4e3d3..94f9aac 100755
--- a/mtgrasp.py
+++ b/mtgrasp.py
@@ -4,6 +4,7 @@
import argparse
import subprocess
import shlex
+mtgrasp_version = 'mtGrasp v1.0.0' # Make sure to edit the version for future releases
parser = argparse.ArgumentParser(description='Usage of mtGrasp')
parser.add_argument('-r1', '--read1', help='Forward read fastq.gz file', required=True)
@@ -19,8 +20,8 @@
parser.add_argument('-s', '--sealer_fpr', help='False positive rate for the bloom filter used by sealer during gap filling [0.01]', default = 0.01)
parser.add_argument('-p', '--gap_filling_p', help='Merge at most N alternate paths during sealer gap filling step [5]', default = 5)
parser.add_argument('-b', '--sealer_k', help='k-mer size used in sealer gap filling [60,80,100,120]', default = '60,80,100,120')
-parser.add_argument('-e', '--end_recov_sealer_fpr', help='False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]', default = 0.01)
-parser.add_argument('-v', '--end_recov_sealer_k', help='k-mer size used in sealer flanking end recovery [60,80,100,120]', default = '60,80,100,120')
+parser.add_argument('-sf', '--end_recov_sealer_fpr', help='False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]', default = 0.01)
+parser.add_argument('-sk', '--end_recov_sealer_k', help='k-mer size used in sealer flanking end recovery [60,80,100,120]', default = '60,80,100,120')
parser.add_argument('-i', '--end_recov_p', help='Merge at most N alternate paths during sealer flanking end recovery [5]', default = 5)
parser.add_argument('-u', '--unlock', help='Remove a lock implemented by snakemake on the working directory', action='store_true')
parser.add_argument('-ma', '--mismatch_allowed', help='Maximum number of mismatches allowed while determining the overlapping region between the two ends of the mitochondrial assembly [1]', default = 1)
@@ -28,6 +29,7 @@
parser.add_argument('-nsub', '--nosubsample', help='Run mtGrasp using the entire read dataset without subsampling [False]', action='store_true')
parser.add_argument('-an', '--annotate', help='Run gene annotation on the final assembly output [False]', action='store_true')
parser.add_argument('-d', '--delete', help='Delete intermediate subdirectories/files once mtGrasp reaches completion [False]', action='store_true')
+parser.add_argument('-v', '--version', action="version", version=mtgrasp_version)
@@ -57,6 +59,7 @@
+
# get the directory of the mtgrasp.smk script
string = subprocess.check_output(['which', 'mtgrasp.smk'])
string = string.decode('utf-8')
@@ -96,6 +99,7 @@
{read1_base} {read2_base} {script_dir} {threads} {mt_gen} {kmer} \
{kc} {ref_path} {abyss_fpr} {sealer_fpr} {p} {sealer_k} \
{end_recov_sealer_fpr} {end_recov_p} {end_recov_sealer_k} {mismatch_allowed} 'No' "))
+
else:
print('Please double check mtGrasp usage information')
exit(1)
diff --git a/mtgrasp.smk b/mtgrasp.smk
index 64de3d4..be4ed39 100755
--- a/mtgrasp.smk
+++ b/mtgrasp.smk
@@ -1,4 +1,7 @@
# Snakemake file for mtGrasp pipeline
+# Make sure to edit the version for new releases
+mtgrasp_version = 'v1.0.0'
+
import os.path
import shlex
@@ -19,7 +22,7 @@ else:
# Start of the pipeline
rule all:
input:
- expand(current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp-assembly.fa", library = config["out_dir"], k = config["k"], kc = config["kc"])
+ expand(current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp_%s-assembly.fa"%(mtgrasp_version), library = config["out_dir"], k = config["k"], kc = config["kc"])
@@ -186,8 +189,8 @@ rule create_lists:
input:
rules.blast.output
output:
- ref_list=current_dir + "{library}/blast/k{k}_kc{kc}-query.txt",
- query_list=current_dir + "{library}/blast/k{k}_kc{kc}-ref.txt"
+ ref_list=current_dir + "{library}/blast/k{k}_kc{kc}-ref.txt",
+ query_list=current_dir + "{library}/blast/k{k}_kc{kc}-query.txt"
benchmark:
current_dir + "{library}/benchmark/k{k}_kc{kc}.create_lists.benchmark.txt"
shell:
@@ -202,14 +205,18 @@ rule extract_seq:
assemblies=rules.select_length.output
output:
query_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-scaffolds.blast-mt_db.fa",
- ref_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-ref.fa"
+ ref_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-ref.fa",
benchmark:
current_dir + "{library}/benchmark/k{k}_kc{kc}.extract_seq.benchmark.txt"
params:
- ref_fasta=config["ref_path"]
+ ref_fasta=config["ref_path"],
+ ref_outdir=current_dir + "{library}/blast/mito_refs",
+ ref_config = current_dir + "{library}/blast/k{k}_kc{kc}-ntjoin_ref_config.csv"
shell:
"seqtk subseq {input.assemblies} {input.query} > {output.query_out} ; "
- "seqtk subseq {params.ref_fasta} {input.ref} > {output.ref_out}"
+ "seqtk subseq {params.ref_fasta} {input.ref} > {output.ref_out} ;"
+ " mkdir -p {params.ref_outdir} && create_references_for_ntjoin.py {output.ref_out} {params.ref_outdir} {params.ref_config}"
+
#Prepare for polishing: ntJoin+Sealer or Sealer
@@ -228,19 +235,18 @@ rule pre_polishing:
sealer_fpr=config['sealer_fpr'],
threads=config['threads'],
p=config['p'],
- k=config['sealer_k']
- benchmark:
- current_dir + "{library}/benchmark/k{k}_kc{kc}.pre_polishing.benchmark.txt"
- log:
+ k=config['sealer_k'],
+ ref_config = current_dir + "{library}/blast/k{k}_kc{kc}-ntjoin_ref_config.csv",
sealer=current_dir + "{library}/prepolishing/k{k}_kc{kc}_sealer.log",
ntjoin=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}_ntjoin.log"
-
+ benchmark:
+ current_dir + "{library}/benchmark/k{k}_kc{kc}.pre_polishing.benchmark.txt"
run:
import os
target = os.path.basename(input.target)
ref = os.path.basename(input.ref)
- log_ntjoin = os.path.basename(log.ntjoin)
- log_sealer = log.sealer
+ log_ntjoin = os.path.basename(params.ntjoin)
+ log_sealer = params.sealer
bf = bf_sealer(params.r1, params.r2, wildcards.library, params.threads, params.sealer_fpr,params.k)
count = sum(1 for line in open(input[0]))
k = k_string_converter(params.k)
@@ -249,7 +255,7 @@ rule pre_polishing:
if count == 0:
print(f"Input file {input.target} is empty, no mitochondrial sequence found.")
exit(1)
- if num_gaps != 0 and count == 2:
+ elif num_gaps != 0 and count == 2:
print("---Start Sealer Gap Filling---")
print("One-piece contig found, no ntJoin scaffolding needed")
shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {input.target} {params.r1} {params.r2} &> {log_sealer}""")
@@ -261,9 +267,15 @@ rule pre_polishing:
# If multiple contigs are found, both ntJoin and Sealer are needed
else:
- print("Multiple contigs found, ntJoin scaffolding and Gap-filling are both needed")
- shell("""bash run_ntjoin.sh {params.workdir} {target} {ref} {log_ntjoin} {params.threads}""")
- shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {params.ntjoin_out} {params.r1} {params.r2} &> {log_sealer}""")
+ print("Multiple contigs found, ntJoin scaffolding starts")
+ shell("""run_ntjoin.sh {params.workdir} {target} {params.ref_config} {log_ntjoin} {params.threads}""")
+ # check gaps need to be filled or not post-ntJoin
+ if check_gaps(params.ntjoin_out) == 0:
+ print("---No Gaps Found After ntJoin, Gap Filling Not Needed---")
+ shell("cp {params.ntjoin_out} {output}")
+ else:
+ print("---Gap Found After ntJoin, Start Sealer Gap Filling---")
+ shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {params.ntjoin_out} {params.r1} {params.r2} &> {log_sealer}""")
@@ -348,7 +360,7 @@ rule standardization:
input:
rules.end_recovery.output
output:
- current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp-assembly.fa"
+ current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp_%s-assembly.fa"%(mtgrasp_version)
benchmark:
current_dir + "{library}/benchmark/k{k}_kc{kc}.standardization.benchmark.txt"
params:
diff --git a/mtgrasp_standardize.py b/mtgrasp_standardize.py
index ce01398..ed654f8 100755
--- a/mtgrasp_standardize.py
+++ b/mtgrasp_standardize.py
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
+# Make sure to edit the version for new releases
+mtgrasp_version = 'v1.0.0'
+
import argparse
import sys
import shlex
@@ -187,7 +190,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
if os.stat(file).st_size == 0:
print('File is empty, no start-site standardization needed')
# write an empty fasta file
- with open('%s/%s.final-mtgrasp-assembly.fa'%(output_dir,sample), 'w') as f:
+ with open('%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir,sample, mtgrasp_version), 'w') as f:
f.write('')
@@ -197,7 +200,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
elif num_of_assemblies > 1 and scenario_in_file == False:
print('Multiple contigs in the fasta file, standardization cannot be performed')
# write the original fasta file to the output directory
- with open('%s/%s.final-mtgrasp-assembly.fa'%(output_dir,sample), 'w') as f:
+ with open('%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir,sample, mtgrasp_version), 'w') as f:
for record in SeqIO.parse(file, "fasta"):
f.write('>Non-Standardized_%s_seq\n' % (sample) + str(record.seq) + '\n')
# if the file has only one sequence or has 2 sequences (but 'Scenario' is found in file)
@@ -271,7 +274,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
standardized_seq = remove_duplicates_in_a_list(standardized_seq)
- file_name = '%s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample)
+ file_name = '%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version)
# Store the results of the function and file reading in variables
@@ -301,7 +304,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
# Run MitoS annotation on the final assembly fasta file if '-a' argument is provided
if annotate:
- print('Start Annotating %s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample))
+ print('Start Annotating %s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version))
if not os.path.exists(f'{output_dir}/annotation_output'):
cmd1 = f'mkdir -p {output_dir}/annotation_output'
cmd_shlex1 = shlex.split(cmd1)
@@ -310,7 +313,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
try:
- output = run_mitos("mitos", '%s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample), mito_gencode,f'{output_dir}/annotation_output', script_dir)
+ output = run_mitos("mitos", '%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version), mito_gencode,f'{output_dir}/annotation_output', script_dir)
print(f"Output: {output}")
except Exception as e:
print(f"Error: {e}")
diff --git a/mtgrasp_summarize.py b/mtgrasp_summarize.py
index f8c0cca..1350eba 100755
--- a/mtgrasp_summarize.py
+++ b/mtgrasp_summarize.py
@@ -1,120 +1,127 @@
#!/usr/bin/env python3
-# This script generates a summary of the assembly outputs (including the number of one-pieces, the maximum scaffold length in each assembly, and the assembly name)
-# and writes the list of path to assembly output files to a text file.
-
-
+'''
+This script can be used to summarize mtGrasp assembly outputs by providing a text file containing the relative or complete path(s) to assembly output folder(s).
+'''
+mtgrasp_version = 'v1.0.0' # Make sure to edit the version for future releases
import argparse
import os
import os.path
-from Bio import SeqIO
import pandas as pd
import sys
-
parser = argparse.ArgumentParser()
parser.add_argument('-i','--input',help='Input text file containing the path(s) to assembly output folder(s)',required=True)
-parser.add_argument('-t','--output',help='Output tsv filename',required=True)
-parser.add_argument('-l','--pathlist',help='Output file consisting of the full path(s) to the assembly output file(s)',required=True)
+parser.add_argument('-p','--prefix',help='Prefix for output files',required=True)
args = parser.parse_args()
input = args.input
-output = args.output
-pathlist = args.pathlist
-
-
-
-def summarize_outputs(input, output, pathlist):
- # report the sequence lengths of each scaffold in a fasta file
- def count_seq_lengths(file):
- lengths = []
- with open(file, "r") as f:
- for line in f:
- if line.startswith(">"):
- continue
- else:
- lengths.append(len(line.strip()))
- return ",".join([str(x) for x in lengths])
-
-# report the standardization status of each assembly
- def extract_headers(file):
- headers = []
- with open(file, "r") as f:
- for line in f:
- if line.startswith(">"):
- line = line.strip()[1:] #remove >
- headers.append(line)
- return ",".join(headers)
-
-
- assembly = []
- number_of_sequences = []
- scaffold_lengths = []
- file_list =[]
- gap_remaining = []
- pre_sealer_gap = []
- sealer_prefix = []
- status = []
-
- file = open(input, 'r')
- lines = file.read().splitlines()
-
-
- for line in lines:
- #summarize sealer log files
- dir_sealer = "%s/prepolishing"%(line)
- dir_sealer_exists=os.path.isdir(dir_sealer)
- if dir_sealer_exists == True:
- for f in os.listdir(dir_sealer):
- f_basename = os.path.basename(f)
- if f_basename.endswith("_log.txt") and os.stat("%s/%s"%(dir_sealer,f)).st_size != 0:
- sealer_prefix.append("%s_%s"%(os.path.basename(line),'_'.join(f_basename.split('.')[0].split('_')[:-1])))
- with open("%s/%s"%(dir_sealer,f)) as file_in:
- ls = []
- for l in file_in:
- ls.append(l)
- gap_remaining.append(int(float(ls[1].split(' ')[0]) - float(ls[-3].split(' ')[3])))
- pre_sealer_gap.append(int(float(ls[1].split(' ')[0])))
- else:
- print("Error: %s does not exist"%(dir_sealer))
- continue
-
-
- #summarize assembly outputs
- directory = "%s/final_output"%(line)
+prefix = args.prefix
+
+tsv_filename = f'{prefix}_mtgrasp_{mtgrasp_version}_assembly_summary.tsv'
+path_txt_filename = f'{prefix}_mtgrasp_{mtgrasp_version}_path_to_output.txt'
+
+
+# Count the number of contigs, Ns per 1000bp, and number of bases, length of longest contig, standardization status, etc. in an assembly output fasta
+def get_assembly_metrics(fasta):
+ seq_count = 0
+ num_bases = 0
+ total_n_count = 0 #Ns per 1000bp
+ with open(fasta, 'r') as f:
+ seq_len_list = []
+ for line in f:
+ line = line.strip()
+ if line.startswith('>'):
+ fasta_header = line
+ else:
+ seq = line.upper()
+ seq_len = len(seq)
+ seq_len_list.append(seq_len)
+ num_bases += seq_len
+ total_n_count += seq.count("N") # number of Ns per fasta sequence
+ seq_count += 1
+
+ if num_bases != 0:
+ n_count_per_1000 = total_n_count /num_bases/ 1000
+ else:
+ n_count_per_1000 = 0
+ max_seq_len = max(seq_len_list)
+ if seq_count == 0:
+ standardization_status = 'N/A'
+ circle_check = 'N/A'
+ elif seq_count == 1:
+ if "Circular" in fasta_header:
+ circle_check = 'Circular'
+ else:
+ circle_check = 'Linear'
+
+ if "StartSite" in fasta_header and "Strand" in fasta_header:
+ standardization_status = "StartSite_Strand_Standardized"
+ elif "StartSite" in fasta_header and "Strand" not in fasta_header:
+ standardization_status = "StartSite_Standardized"
+ elif "StartSite" not in fasta_header and "Strand" not in fasta_header:
+ standardization_status = "StartSite_Standardized"
+ else:
+ standardization_status = "Non-Standardized"
+ else:
+ # Circularization and standardization are only conducted for one-piece assemblies to avoid potentially introducing misassemblies
+ circle_check = 'Linear'
+ standardization_status = "Non-Standardized"
+
+
+ return n_count_per_1000, seq_count, num_bases, max_seq_len, circle_check, standardization_status
+
+assembly_list, n_count_list, seq_count_list, num_bases_list, max_seq_list, circle_check_list, standardization_status_list, file_list = [], [], [], [], [], [], [], []
+
+with open(input, 'r') as dirs:
+ for outdir in dirs:
+ outdir = outdir.strip()
+ if not os.path.exists(outdir):
+ print(f"Error: mtGrasp output directory {outdir} does not exist")
+ directory = "%s/final_output"%(outdir)
dir_exists=os.path.exists(directory)
- if dir_exists == True:
+
+ if dir_exists:
for dir in os.listdir(directory):
- fasta = os.path.abspath("%s/%s/%s.final-mtgrasp-assembly.fa"%(directory,dir,dir))
- file_list.append(fasta)
+
+ fasta = os.path.abspath("%s/%s/%s.final-mtgrasp_%s-assembly.fa"%(directory,dir,dir, mtgrasp_version)) # Complete path to output fasta file
+ # check if fasta output file exists
+ if os.path.exists(fasta) and any(line.startswith('>') for line in open(fasta)):
+ n_count_list.append(get_assembly_metrics(fasta)[0])
+ seq_count_list.append(get_assembly_metrics(fasta)[1])
+ num_bases_list.append(get_assembly_metrics(fasta)[2])
+ max_seq_list.append(get_assembly_metrics(fasta)[3])
+ circle_check_list.append(get_assembly_metrics(fasta)[4])
+ standardization_status_list.append(get_assembly_metrics(fasta)[5])
+ else:
+ n_count_list.append(0)
+ seq_count_list.append(0)
+ num_bases_list.append(0)
+ max_seq_list.append(0)
+ circle_check_list.append('N/A')
+ standardization_status_list.append('N/A')
+
- assembly.append(dir)
- n = len([1 for l in open(fasta) if l.startswith(">")])
- number_of_sequences.append(n)
- scaffold_lengths.append(count_seq_lengths(fasta))
- status.append(extract_headers(fasta))
- else:
- print("Error: %s does not exist"%(directory))
- continue
-
-
-
- df_assm = pd.DataFrame({'Assembly':assembly, 'Number of sequences':number_of_sequences, 'Scaffold lengths':scaffold_lengths, 'Standardization status':status})
- df_sealer = pd.DataFrame({'Assembly':sealer_prefix, 'Number of gaps (pre-GapFilling)':pre_sealer_gap, 'Number of gaps (post-GapFilling)':gap_remaining})
-
- df = pd.merge(df_assm, df_sealer, on='Assembly')
-
-
-
- df.to_csv(output,sep='\t',index=False)
- with open(pathlist, 'w') as f:
- for item in file_list:
- f.write("%s\n" % item)
-
- return 'Summary of assembly outputs and a list of paths to the final mtGrasp output fasta files generated successfully! \n They can be found here \n Output tsv summary: %s \n Output file consisting of the full path(s) to the assembly output file(s): %s'%(output,pathlist)
-
-
-if __name__ == '__main__':
- print(summarize_outputs(input, output, pathlist))
+ assembly_list.append(dir)
+ file_list.append(fasta)
+
+
+df = pd.DataFrame({"Assembly": assembly_list,
+ "Ns per 1000bp": n_count_list,
+ "Number of Contigs": seq_count_list,
+ "Total Number of Base Pairs Per Assembly": num_bases_list,
+ "Length of the Longest Contig (bp)": max_seq_list,
+ "Circular or Linear": circle_check_list,
+ "Standardization Status": standardization_status_list
+ })
+
+df.to_csv(tsv_filename, sep="\t")
+
+with open(path_txt_filename, 'w') as f:
+ for path in file_list:
+ f.write(f"{path}\n")
+
+
+
diff --git a/run_ntjoin.sh b/run_ntjoin.sh
index 39a2ec7..8a380fe 100755
--- a/run_ntjoin.sh
+++ b/run_ntjoin.sh
@@ -1,3 +1,3 @@
#!/bin/bash
cd $1
-ntJoin assemble target=$2 target_weight=1 references=$3 t=$5 reference_weights='2' k=32 w=100 &> $4
+ntJoin assemble target=$2 target_weight=1 reference_config=$3 t=$5 k=32 w=100 &> $4