diff --git a/README.md b/README.md index 48dd0c7..9d4eae7 100755 --- a/README.md +++ b/README.md @@ -40,8 +40,25 @@ git clone https://github.com/bcgsc/mtGrasp.git * ntCard --- +### Installation Instructions for Dependencies +#### Step 1: -#### Special Installation Instructions for MITOS +Recommended (Faster): +``` +conda create -n mtgrasp python=3.10 mamba +conda activate mtgrasp +mamba install -c conda-forge -c bioconda snakemake 'blast>=2.10.0' biopython seqtk abyss ntjoin bwa samtools pilon ntcard +``` + +Alternative (Slower): +``` +conda create -n mtgrasp python=3.10 +conda activate mtgrasp +conda install -c conda-forge -c bioconda snakemake 'blast>=2.10.0' biopython seqtk abyss ntjoin bwa samtools pilon ntcard +``` + + +#### Step 2: Special Installation Instructions for MITOS As MITOS uses an older Python version, please install it in a new conda environment called "mitos" using the instructions below: @@ -121,9 +138,9 @@ However, if such sequences are unavailable, you can move up the taxonomic hierar `-b` or `--sealer_k=STRING`: k-mer size(s) used in sealer gap filling ['60,80,100,120'] -`-e` or `--end_recov_sealer_fpr=N`: False positive rate for the bloom filter used by sealer during flanking end recovery [0.01] +`-sf` or `--end_recov_sealer_fpr=N`: False positive rate for the bloom filter used by sealer during flanking end recovery [0.01] -`-v` or `--end_recov_sealer_k=STRING`: k-mer size used in sealer flanking end recovery ['60,80,100,120'] +`-sk` or `--end_recov_sealer_k=STRING`: k-mer size used in sealer flanking end recovery ['60,80,100,120'] `-i` or `--end_recov_p=N`: Merge at most N alternate paths during sealer flanking end recovery; use 'nolimit' for no limit [5] @@ -131,6 +148,7 @@ However, if such sequences are unavailable, you can move up the taxonomic hierar `-ma` or `--mismatch_allowed=N`: Maximum number of mismatches allowed while determining the overlapping region between the two ends of the mitochondrial assembly [1] +`-v` or `--version`: Print out the version of mtGrasp and exit --- @@ -193,28 +211,6 @@ If you are not interested in the standardized mitogenome sequence(s) or MITOS an -### Summarize mtGrasp results - -``` -mtgrasp_summarize.py -i -t -l -``` -Here, this script will summarize the mtGrasp results for all assembly output folders listed in the input text file ``. The output tsv file `` will contain the following columns: - -`Assembly`: the name of the assembly output folder along with the k and kc values used for the assembly - -`Number of sequences`: the number of mitochondrial sequences generated by mtGrasp - -`Scaffold lengths`: the length(s) of the mitochondrial sequence(s) generated by mtGrasp - -`Standardization status`: whether the mitochondrial sequence(s) generated by mtGrasp is standardized or not - -`Number of gaps (pre-GapFilling)`: the number of gaps in the mitochondrial sequence(s) generated by mtGrasp before gap filling - -`Number of gaps (post-GapFilling)`: the number of gaps in the mitochondrial sequence(s) generated by mtGrasp after gap filling - - -Finally, `` will contain the full path(s) to the assembly output fasta file(s) - --- ### Standardize your own mitochondrial sequence(s) using mtGrasp If you have your own mitochondrial sequence(s) and would like to standardize it/them using mtGrasp, you can do so by using mtGrasp's `mtgrasp_standardize.py` script. @@ -263,5 +259,30 @@ Because mtGrasp annotation uses a third-party tool called [MITOS](https://www.sc +--- + +### Summarize mtGrasp results + +``` +mtgrasp_summarize.py -i -p +``` + +Here, this script will summarize the mtGrasp results for all assembly output folders listed in the input text file ``. The output tsv file `{prefix}_mtgrasp_{mtgrasp_version}_assembly_summary.tsv'` will contain the following columns: + +`Assembly`: the name of the assembly output folder along with the k and kc values (number of read pairs if `-sub` is enabled) used for the assembly + +`Ns per 1000bp`: the number of Ns per 1000bp in the mitochondrial sequence(s) generated by mtGrasp + +`Number of Contigs`: the number of mitochondrial sequences generated by mtGrasp + +`Total Number of Base Pairs Per Assembly`: the total number of base pairs in the mitochondrial sequence(s) generated by mtGrasp + +`Length of the Lonest Contig (bp)`: the length of the longest mitochondrial sequence generated by mtGrasp + +`Circular or Linear`: whether the mitochondrial sequence(s) generated by mtGrasp is circular or linear + +`Standardization Status`: whether the mitochondrial sequence(s) generated by mtGrasp is start-site standardized and strand standardized + +Finally, `{prefix}_mtgrasp_{mtgrasp_version}_path_to_output.txt` will contain the complete path(s) to the assembly output fasta file(s) diff --git a/create_references_for_ntjoin.py b/create_references_for_ntjoin.py new file mode 100755 index 0000000..9da0754 --- /dev/null +++ b/create_references_for_ntjoin.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +# Split a fasta file into multiple files, one per sequence and create a reference config file for ntJoin +# Usage: split_fasta.py + +import sys + +ntjoin_reference_weight = 2 #Refer to ntJoin README for more info: https://github.com/bcgsc/ntJoin +input = sys.argv[1] +output = sys.argv[2] +config = sys.argv[3] +ref_list = [] +with open(input, 'r') as f: + for line in f: + line = line.strip() + if line.startswith('>'): + header = line + filename = header.split(' ')[0].strip('>') + '.fasta' + else: + sequence = line + with open(output + '/' + filename, 'w') as g: + g.write(header + '\n') + g.write(sequence + '\n') + ref_list.append(f'{output}/{filename},{ntjoin_reference_weight}') + +with open(config, 'w') as f: + f.write('\n'.join(ref_list)) + + diff --git a/mtgrasp.py b/mtgrasp.py index 2a4e3d3..94f9aac 100755 --- a/mtgrasp.py +++ b/mtgrasp.py @@ -4,6 +4,7 @@ import argparse import subprocess import shlex +mtgrasp_version = 'mtGrasp v1.0.0' # Make sure to edit the version for future releases parser = argparse.ArgumentParser(description='Usage of mtGrasp') parser.add_argument('-r1', '--read1', help='Forward read fastq.gz file', required=True) @@ -19,8 +20,8 @@ parser.add_argument('-s', '--sealer_fpr', help='False positive rate for the bloom filter used by sealer during gap filling [0.01]', default = 0.01) parser.add_argument('-p', '--gap_filling_p', help='Merge at most N alternate paths during sealer gap filling step [5]', default = 5) parser.add_argument('-b', '--sealer_k', help='k-mer size used in sealer gap filling [60,80,100,120]', default = '60,80,100,120') -parser.add_argument('-e', '--end_recov_sealer_fpr', help='False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]', default = 0.01) -parser.add_argument('-v', '--end_recov_sealer_k', help='k-mer size used in sealer flanking end recovery [60,80,100,120]', default = '60,80,100,120') +parser.add_argument('-sf', '--end_recov_sealer_fpr', help='False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]', default = 0.01) +parser.add_argument('-sk', '--end_recov_sealer_k', help='k-mer size used in sealer flanking end recovery [60,80,100,120]', default = '60,80,100,120') parser.add_argument('-i', '--end_recov_p', help='Merge at most N alternate paths during sealer flanking end recovery [5]', default = 5) parser.add_argument('-u', '--unlock', help='Remove a lock implemented by snakemake on the working directory', action='store_true') parser.add_argument('-ma', '--mismatch_allowed', help='Maximum number of mismatches allowed while determining the overlapping region between the two ends of the mitochondrial assembly [1]', default = 1) @@ -28,6 +29,7 @@ parser.add_argument('-nsub', '--nosubsample', help='Run mtGrasp using the entire read dataset without subsampling [False]', action='store_true') parser.add_argument('-an', '--annotate', help='Run gene annotation on the final assembly output [False]', action='store_true') parser.add_argument('-d', '--delete', help='Delete intermediate subdirectories/files once mtGrasp reaches completion [False]', action='store_true') +parser.add_argument('-v', '--version', action="version", version=mtgrasp_version) @@ -57,6 +59,7 @@ + # get the directory of the mtgrasp.smk script string = subprocess.check_output(['which', 'mtgrasp.smk']) string = string.decode('utf-8') @@ -96,6 +99,7 @@ {read1_base} {read2_base} {script_dir} {threads} {mt_gen} {kmer} \ {kc} {ref_path} {abyss_fpr} {sealer_fpr} {p} {sealer_k} \ {end_recov_sealer_fpr} {end_recov_p} {end_recov_sealer_k} {mismatch_allowed} 'No' ")) + else: print('Please double check mtGrasp usage information') exit(1) diff --git a/mtgrasp.smk b/mtgrasp.smk index 64de3d4..be4ed39 100755 --- a/mtgrasp.smk +++ b/mtgrasp.smk @@ -1,4 +1,7 @@ # Snakemake file for mtGrasp pipeline +# Make sure to edit the version for new releases +mtgrasp_version = 'v1.0.0' + import os.path import shlex @@ -19,7 +22,7 @@ else: # Start of the pipeline rule all: input: - expand(current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp-assembly.fa", library = config["out_dir"], k = config["k"], kc = config["kc"]) + expand(current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp_%s-assembly.fa"%(mtgrasp_version), library = config["out_dir"], k = config["k"], kc = config["kc"]) @@ -186,8 +189,8 @@ rule create_lists: input: rules.blast.output output: - ref_list=current_dir + "{library}/blast/k{k}_kc{kc}-query.txt", - query_list=current_dir + "{library}/blast/k{k}_kc{kc}-ref.txt" + ref_list=current_dir + "{library}/blast/k{k}_kc{kc}-ref.txt", + query_list=current_dir + "{library}/blast/k{k}_kc{kc}-query.txt" benchmark: current_dir + "{library}/benchmark/k{k}_kc{kc}.create_lists.benchmark.txt" shell: @@ -202,14 +205,18 @@ rule extract_seq: assemblies=rules.select_length.output output: query_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-scaffolds.blast-mt_db.fa", - ref_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-ref.fa" + ref_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-ref.fa", benchmark: current_dir + "{library}/benchmark/k{k}_kc{kc}.extract_seq.benchmark.txt" params: - ref_fasta=config["ref_path"] + ref_fasta=config["ref_path"], + ref_outdir=current_dir + "{library}/blast/mito_refs", + ref_config = current_dir + "{library}/blast/k{k}_kc{kc}-ntjoin_ref_config.csv" shell: "seqtk subseq {input.assemblies} {input.query} > {output.query_out} ; " - "seqtk subseq {params.ref_fasta} {input.ref} > {output.ref_out}" + "seqtk subseq {params.ref_fasta} {input.ref} > {output.ref_out} ;" + " mkdir -p {params.ref_outdir} && create_references_for_ntjoin.py {output.ref_out} {params.ref_outdir} {params.ref_config}" + #Prepare for polishing: ntJoin+Sealer or Sealer @@ -228,19 +235,18 @@ rule pre_polishing: sealer_fpr=config['sealer_fpr'], threads=config['threads'], p=config['p'], - k=config['sealer_k'] - benchmark: - current_dir + "{library}/benchmark/k{k}_kc{kc}.pre_polishing.benchmark.txt" - log: + k=config['sealer_k'], + ref_config = current_dir + "{library}/blast/k{k}_kc{kc}-ntjoin_ref_config.csv", sealer=current_dir + "{library}/prepolishing/k{k}_kc{kc}_sealer.log", ntjoin=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}_ntjoin.log" - + benchmark: + current_dir + "{library}/benchmark/k{k}_kc{kc}.pre_polishing.benchmark.txt" run: import os target = os.path.basename(input.target) ref = os.path.basename(input.ref) - log_ntjoin = os.path.basename(log.ntjoin) - log_sealer = log.sealer + log_ntjoin = os.path.basename(params.ntjoin) + log_sealer = params.sealer bf = bf_sealer(params.r1, params.r2, wildcards.library, params.threads, params.sealer_fpr,params.k) count = sum(1 for line in open(input[0])) k = k_string_converter(params.k) @@ -249,7 +255,7 @@ rule pre_polishing: if count == 0: print(f"Input file {input.target} is empty, no mitochondrial sequence found.") exit(1) - if num_gaps != 0 and count == 2: + elif num_gaps != 0 and count == 2: print("---Start Sealer Gap Filling---") print("One-piece contig found, no ntJoin scaffolding needed") shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {input.target} {params.r1} {params.r2} &> {log_sealer}""") @@ -261,9 +267,15 @@ rule pre_polishing: # If multiple contigs are found, both ntJoin and Sealer are needed else: - print("Multiple contigs found, ntJoin scaffolding and Gap-filling are both needed") - shell("""bash run_ntjoin.sh {params.workdir} {target} {ref} {log_ntjoin} {params.threads}""") - shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {params.ntjoin_out} {params.r1} {params.r2} &> {log_sealer}""") + print("Multiple contigs found, ntJoin scaffolding starts") + shell("""run_ntjoin.sh {params.workdir} {target} {params.ref_config} {log_ntjoin} {params.threads}""") + # check gaps need to be filled or not post-ntJoin + if check_gaps(params.ntjoin_out) == 0: + print("---No Gaps Found After ntJoin, Gap Filling Not Needed---") + shell("cp {params.ntjoin_out} {output}") + else: + print("---Gap Found After ntJoin, Start Sealer Gap Filling---") + shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {params.ntjoin_out} {params.r1} {params.r2} &> {log_sealer}""") @@ -348,7 +360,7 @@ rule standardization: input: rules.end_recovery.output output: - current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp-assembly.fa" + current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp_%s-assembly.fa"%(mtgrasp_version) benchmark: current_dir + "{library}/benchmark/k{k}_kc{kc}.standardization.benchmark.txt" params: diff --git a/mtgrasp_standardize.py b/mtgrasp_standardize.py index ce01398..ed654f8 100755 --- a/mtgrasp_standardize.py +++ b/mtgrasp_standardize.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# Make sure to edit the version for new releases +mtgrasp_version = 'v1.0.0' + import argparse import sys import shlex @@ -187,7 +190,7 @@ def run_mitos(env_name, file, code, dir, script_dir): if os.stat(file).st_size == 0: print('File is empty, no start-site standardization needed') # write an empty fasta file - with open('%s/%s.final-mtgrasp-assembly.fa'%(output_dir,sample), 'w') as f: + with open('%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir,sample, mtgrasp_version), 'w') as f: f.write('') @@ -197,7 +200,7 @@ def run_mitos(env_name, file, code, dir, script_dir): elif num_of_assemblies > 1 and scenario_in_file == False: print('Multiple contigs in the fasta file, standardization cannot be performed') # write the original fasta file to the output directory - with open('%s/%s.final-mtgrasp-assembly.fa'%(output_dir,sample), 'w') as f: + with open('%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir,sample, mtgrasp_version), 'w') as f: for record in SeqIO.parse(file, "fasta"): f.write('>Non-Standardized_%s_seq\n' % (sample) + str(record.seq) + '\n') # if the file has only one sequence or has 2 sequences (but 'Scenario' is found in file) @@ -271,7 +274,7 @@ def run_mitos(env_name, file, code, dir, script_dir): standardized_seq = remove_duplicates_in_a_list(standardized_seq) - file_name = '%s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample) + file_name = '%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version) # Store the results of the function and file reading in variables @@ -301,7 +304,7 @@ def run_mitos(env_name, file, code, dir, script_dir): # Run MitoS annotation on the final assembly fasta file if '-a' argument is provided if annotate: - print('Start Annotating %s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample)) + print('Start Annotating %s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version)) if not os.path.exists(f'{output_dir}/annotation_output'): cmd1 = f'mkdir -p {output_dir}/annotation_output' cmd_shlex1 = shlex.split(cmd1) @@ -310,7 +313,7 @@ def run_mitos(env_name, file, code, dir, script_dir): try: - output = run_mitos("mitos", '%s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample), mito_gencode,f'{output_dir}/annotation_output', script_dir) + output = run_mitos("mitos", '%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version), mito_gencode,f'{output_dir}/annotation_output', script_dir) print(f"Output: {output}") except Exception as e: print(f"Error: {e}") diff --git a/mtgrasp_summarize.py b/mtgrasp_summarize.py index f8c0cca..1350eba 100755 --- a/mtgrasp_summarize.py +++ b/mtgrasp_summarize.py @@ -1,120 +1,127 @@ #!/usr/bin/env python3 -# This script generates a summary of the assembly outputs (including the number of one-pieces, the maximum scaffold length in each assembly, and the assembly name) -# and writes the list of path to assembly output files to a text file. - - +''' +This script can be used to summarize mtGrasp assembly outputs by providing a text file containing the relative or complete path(s) to assembly output folder(s). +''' +mtgrasp_version = 'v1.0.0' # Make sure to edit the version for future releases import argparse import os import os.path -from Bio import SeqIO import pandas as pd import sys - parser = argparse.ArgumentParser() parser.add_argument('-i','--input',help='Input text file containing the path(s) to assembly output folder(s)',required=True) -parser.add_argument('-t','--output',help='Output tsv filename',required=True) -parser.add_argument('-l','--pathlist',help='Output file consisting of the full path(s) to the assembly output file(s)',required=True) +parser.add_argument('-p','--prefix',help='Prefix for output files',required=True) args = parser.parse_args() input = args.input -output = args.output -pathlist = args.pathlist - - - -def summarize_outputs(input, output, pathlist): - # report the sequence lengths of each scaffold in a fasta file - def count_seq_lengths(file): - lengths = [] - with open(file, "r") as f: - for line in f: - if line.startswith(">"): - continue - else: - lengths.append(len(line.strip())) - return ",".join([str(x) for x in lengths]) - -# report the standardization status of each assembly - def extract_headers(file): - headers = [] - with open(file, "r") as f: - for line in f: - if line.startswith(">"): - line = line.strip()[1:] #remove > - headers.append(line) - return ",".join(headers) - - - assembly = [] - number_of_sequences = [] - scaffold_lengths = [] - file_list =[] - gap_remaining = [] - pre_sealer_gap = [] - sealer_prefix = [] - status = [] - - file = open(input, 'r') - lines = file.read().splitlines() - - - for line in lines: - #summarize sealer log files - dir_sealer = "%s/prepolishing"%(line) - dir_sealer_exists=os.path.isdir(dir_sealer) - if dir_sealer_exists == True: - for f in os.listdir(dir_sealer): - f_basename = os.path.basename(f) - if f_basename.endswith("_log.txt") and os.stat("%s/%s"%(dir_sealer,f)).st_size != 0: - sealer_prefix.append("%s_%s"%(os.path.basename(line),'_'.join(f_basename.split('.')[0].split('_')[:-1]))) - with open("%s/%s"%(dir_sealer,f)) as file_in: - ls = [] - for l in file_in: - ls.append(l) - gap_remaining.append(int(float(ls[1].split(' ')[0]) - float(ls[-3].split(' ')[3]))) - pre_sealer_gap.append(int(float(ls[1].split(' ')[0]))) - else: - print("Error: %s does not exist"%(dir_sealer)) - continue - - - #summarize assembly outputs - directory = "%s/final_output"%(line) +prefix = args.prefix + +tsv_filename = f'{prefix}_mtgrasp_{mtgrasp_version}_assembly_summary.tsv' +path_txt_filename = f'{prefix}_mtgrasp_{mtgrasp_version}_path_to_output.txt' + + +# Count the number of contigs, Ns per 1000bp, and number of bases, length of longest contig, standardization status, etc. in an assembly output fasta +def get_assembly_metrics(fasta): + seq_count = 0 + num_bases = 0 + total_n_count = 0 #Ns per 1000bp + with open(fasta, 'r') as f: + seq_len_list = [] + for line in f: + line = line.strip() + if line.startswith('>'): + fasta_header = line + else: + seq = line.upper() + seq_len = len(seq) + seq_len_list.append(seq_len) + num_bases += seq_len + total_n_count += seq.count("N") # number of Ns per fasta sequence + seq_count += 1 + + if num_bases != 0: + n_count_per_1000 = total_n_count /num_bases/ 1000 + else: + n_count_per_1000 = 0 + max_seq_len = max(seq_len_list) + if seq_count == 0: + standardization_status = 'N/A' + circle_check = 'N/A' + elif seq_count == 1: + if "Circular" in fasta_header: + circle_check = 'Circular' + else: + circle_check = 'Linear' + + if "StartSite" in fasta_header and "Strand" in fasta_header: + standardization_status = "StartSite_Strand_Standardized" + elif "StartSite" in fasta_header and "Strand" not in fasta_header: + standardization_status = "StartSite_Standardized" + elif "StartSite" not in fasta_header and "Strand" not in fasta_header: + standardization_status = "StartSite_Standardized" + else: + standardization_status = "Non-Standardized" + else: + # Circularization and standardization are only conducted for one-piece assemblies to avoid potentially introducing misassemblies + circle_check = 'Linear' + standardization_status = "Non-Standardized" + + + return n_count_per_1000, seq_count, num_bases, max_seq_len, circle_check, standardization_status + +assembly_list, n_count_list, seq_count_list, num_bases_list, max_seq_list, circle_check_list, standardization_status_list, file_list = [], [], [], [], [], [], [], [] + +with open(input, 'r') as dirs: + for outdir in dirs: + outdir = outdir.strip() + if not os.path.exists(outdir): + print(f"Error: mtGrasp output directory {outdir} does not exist") + directory = "%s/final_output"%(outdir) dir_exists=os.path.exists(directory) - if dir_exists == True: + + if dir_exists: for dir in os.listdir(directory): - fasta = os.path.abspath("%s/%s/%s.final-mtgrasp-assembly.fa"%(directory,dir,dir)) - file_list.append(fasta) + + fasta = os.path.abspath("%s/%s/%s.final-mtgrasp_%s-assembly.fa"%(directory,dir,dir, mtgrasp_version)) # Complete path to output fasta file + # check if fasta output file exists + if os.path.exists(fasta) and any(line.startswith('>') for line in open(fasta)): + n_count_list.append(get_assembly_metrics(fasta)[0]) + seq_count_list.append(get_assembly_metrics(fasta)[1]) + num_bases_list.append(get_assembly_metrics(fasta)[2]) + max_seq_list.append(get_assembly_metrics(fasta)[3]) + circle_check_list.append(get_assembly_metrics(fasta)[4]) + standardization_status_list.append(get_assembly_metrics(fasta)[5]) + else: + n_count_list.append(0) + seq_count_list.append(0) + num_bases_list.append(0) + max_seq_list.append(0) + circle_check_list.append('N/A') + standardization_status_list.append('N/A') + - assembly.append(dir) - n = len([1 for l in open(fasta) if l.startswith(">")]) - number_of_sequences.append(n) - scaffold_lengths.append(count_seq_lengths(fasta)) - status.append(extract_headers(fasta)) - else: - print("Error: %s does not exist"%(directory)) - continue - - - - df_assm = pd.DataFrame({'Assembly':assembly, 'Number of sequences':number_of_sequences, 'Scaffold lengths':scaffold_lengths, 'Standardization status':status}) - df_sealer = pd.DataFrame({'Assembly':sealer_prefix, 'Number of gaps (pre-GapFilling)':pre_sealer_gap, 'Number of gaps (post-GapFilling)':gap_remaining}) - - df = pd.merge(df_assm, df_sealer, on='Assembly') - - - - df.to_csv(output,sep='\t',index=False) - with open(pathlist, 'w') as f: - for item in file_list: - f.write("%s\n" % item) - - return 'Summary of assembly outputs and a list of paths to the final mtGrasp output fasta files generated successfully! \n They can be found here \n Output tsv summary: %s \n Output file consisting of the full path(s) to the assembly output file(s): %s'%(output,pathlist) - - -if __name__ == '__main__': - print(summarize_outputs(input, output, pathlist)) + assembly_list.append(dir) + file_list.append(fasta) + + +df = pd.DataFrame({"Assembly": assembly_list, + "Ns per 1000bp": n_count_list, + "Number of Contigs": seq_count_list, + "Total Number of Base Pairs Per Assembly": num_bases_list, + "Length of the Longest Contig (bp)": max_seq_list, + "Circular or Linear": circle_check_list, + "Standardization Status": standardization_status_list + }) + +df.to_csv(tsv_filename, sep="\t") + +with open(path_txt_filename, 'w') as f: + for path in file_list: + f.write(f"{path}\n") + + + diff --git a/run_ntjoin.sh b/run_ntjoin.sh index 39a2ec7..8a380fe 100755 --- a/run_ntjoin.sh +++ b/run_ntjoin.sh @@ -1,3 +1,3 @@ #!/bin/bash cd $1 -ntJoin assemble target=$2 target_weight=1 references=$3 t=$5 reference_weights='2' k=32 w=100 &> $4 +ntJoin assemble target=$2 target_weight=1 reference_config=$3 t=$5 k=32 w=100 &> $4