diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e29e68c5..f05ceb88 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -77,3 +77,34 @@ jobs:
       - name: Run pipeline with test data (BGC workflow)
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_bgc,docker --outdir ./results ${{ matrix.parameters }} --bgc_skip_deepbgc
+
+  test_taxonomy:
+    name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows)
+    # Only run on push if this is the nf-core dev branch (merged PRs)
+    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}"
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        NXF_VER:
+          - "23.04.0"
+          - "latest-everything"
+        parameters:
+          - "--annotation_tool prodigal"
+          - "--annotation_tool prokka"
+          - "--annotation_tool bakta --annotation_bakta_db_downloadtype light"
+
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
+
+      - name: Install Nextflow
+        uses: nf-core/setup-nextflow@v1
+        with:
+          version: "${{ matrix.NXF_VER }}"
+
+      - name: Disk space cleanup
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+
+      - name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows)
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_taxonomy,docker --outdir ./results ${{ matrix.parameters }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e809b5d2..41f396ab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,12 +11,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#324](https://github.com/nf-core/funcscan/pull/324) Removed separate DeepARG test profile because database download is now stable. (by @jasmezz)
 - [#332](https://github.com/nf-core/funcscan/pull/332) & [#327](https://github.com/nf-core/funcscan/pull/327) Merged pipeline template of nf-core/tools version 2.12.1 (by @jfy133, @jasmezz)
 - [#338](https://github.com/nf-core/funcscan/pull/338) Set `--meta` parameter to default for Bakta, with singlemode optional. (by @jasmezz)
+- [#343](https://github.com/nf-core/funcscan/pull/343) Added contig taxonomic classification using [MMseqs2](https://github.com/soedinglab/MMseqs2/). (by @darcy220606)
 
 ### `Fixed`
 
-- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. (by @darcy220606)
-
-### `Dependencies`
+- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with 'sample_id\tcontig_id\t..'. Reformatted the output of `hamronization/summarize` module. (by @darcy220606)
+- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore`. (by @darcy220606)
+
+### `Dependencies`
 
 | Tool          | Previous version | New version |
 | ------------- | ---------------- | ----------- |
diff --git a/CITATIONS.md b/CITATIONS.md
index f7e0134a..a605e038 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -90,6 +90,10 @@
 
   > Alcock, B. P., Huynh, W., Chalil, R., Smith, K. W., Raphenya, A. R., Wlodarski, M. A., Edalatmand, A., Petkau, A., Syed, S. A., Tsang, K. K., Baker, S. J. C., Dave, M., McCarthy, M. C., Mukiri, K. M., Nasir, J.
A., Golbon, B., Imtiaz, H., Jiang, X., Kaur, K., Kwong, M., Liang, Z. C., Niu, K. C., Shan, P., Yang, J. Y. J., Gray, K. L., Hoad, G. R., Jia, B., Bhando, T., Carfrae, L. A., Farha, M. A., French, S., Gordzevich, R., Rachwalski, K., Tu, M. M., Bordeleau, E., Dooley, D., Griffiths, E., Zubyk, H. L., Brown, E. D., Maguire, F., Beiko, R. G., Hsiao, W. W. L., Brinkman F. S. L., Van Domselaar, G., McArthur, A. G. (2023). CARD 2023: expanded curation, support for machine learning, and resistome prediction at the Comprehensive Antibiotic Resistance Database. Nucleic acids research, 51(D1):D690-D699. [DOI: 10.1093/nar/gkac920](https://doi.org/10.1093/nar/gkac920)
 
+- [MMseqs2](https://doi.org/10.1093/bioinformatics/btab184)
+
+  > Mirdita, M., Steinegger, M., Breitwieser, F., Söding, J., Levy Karin, E. (2021). Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics, 37(18), 3029–3031. [DOI: 10.1093/bioinformatics/btab184](https://doi.org/10.1093/bioinformatics/btab184)
+
 ## Software packaging/containerisation tools
 
 - [Anaconda](https://anaconda.com)
diff --git a/README.md b/README.md
index 59d100d9..d56dbc47 100644
--- a/README.md
+++ b/README.md
@@ -30,12 +30,13 @@ The nf-core/funcscan AWS full test dataset are contigs generated by the MGnify s
 
 ## Pipeline summary
 
-1. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta)
-2. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify)
-3. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg)
-4. Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/)
-5. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs
-6. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/)
+1. Taxonomic classification of contigs of **prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2)
+2. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta)
+3. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify)
+4. 
Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg)
+5. Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/)
+6. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs
+7. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/)
 
 ![funcscan metro workflow](docs/images/funcscan_metro_workflow.png)
 
diff --git a/bin/comBGC.py b/bin/comBGC.py
index a492af97..3afd6aec 100755
--- a/bin/comBGC.py
+++ b/bin/comBGC.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# Written by Jasmin Frangenberg and released under the MIT license.
+# See below for full license text.
+
 from Bio import SeqIO
 import pandas as pd
 import argparse
@@ -643,6 +646,10 @@ def gecco_workflow(gecco_paths):
         inplace=True,
     )
 
+    # Rearrange and rename the columns in the summary df
+    summary_all = summary_all.iloc[:, [0, 2, 1] + list(range(3, len(summary_all.columns)))]
+    summary_all.rename(columns={'Sample_ID':'sample_id', 'Contig_ID':'contig_id', 'CDS_ID':'BGC_region_contig_ids'}, inplace=True)
+
     # Write results to TSV
     if not os.path.exists(outdir):
         os.makedirs(outdir)
diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py
new file mode 100755
index 00000000..14ea73a1
--- /dev/null
+++ b/bin/merge_taxonomy.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+
+# Written by Anan Ibrahim and released under the MIT license.
+# See git repository (https://github.com/Darcy220606/AMPcombi) for full license text.
+# Date: March 2024
+# Version: 0.1.0
+
+# Required modules
+import sys
+import os
+import pandas as pd
+import numpy as np
+import argparse
+
+tool_version = "0.1.0"
+#########################################
+# TOP LEVEL PARSER
+#########################################
+parser = argparse.ArgumentParser(prog = 'merge_taxonomy', formatter_class=argparse.RawDescriptionHelpFormatter,
+                                 usage='%(prog)s [options]',
+                                 description=('''\
+    .............................................................................
+    *merge_taxonomy*
+    .............................................................................
+    This script merges the summary outputs of all three funcscan workflows with
+    the MMseqs2 taxonomy results. This is done in three submodules that can be
+    activated separately.
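+
+    Example invocation (the file names below are placeholders):
+    merge_taxonomy.py ampcombi_taxa --ampcombi ampcombi_complete_summary.csv --taxonomy sample_1.tsv sample_2.tsv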
+    .............................................................................'''),
+                                 epilog='''Thank you for running merge_taxonomy!''',
+                                 add_help=True)
+parser.add_argument('--version', action='version', version='merge_taxonomy ' + tool_version)
+
+#########################################
+# SUBPARSERS
+#########################################
+subparsers = parser.add_subparsers(required=True)
+
+#########################################
+# SUBPARSER: AMPCOMBI
+#########################################
+ampcombi_parser = subparsers.add_parser('ampcombi_taxa')
+
+ampcombi_parser.add_argument("--ampcombi", dest="amp", nargs='?', help="Enter the path to the ampcombi_complete_summary.csv \n (default: %(default)s)",
+                             type=str, default='ampcombi_complete_summary.csv')
+ampcombi_parser.add_argument("--taxonomy", dest="taxa1", nargs='+', help="Enter the list of taxonomy files for all samples.")
+
+#########################################
+# SUBPARSER: COMBGC
+#########################################
+combgc_parser = subparsers.add_parser('combgc_taxa')
+
+combgc_parser.add_argument("--combgc", dest="bgc", nargs='?', help="Enter the path to the combgc_complete_summary.tsv \n (default: %(default)s)",
+                           type=str, default='combgc_complete_summary.tsv')
+combgc_parser.add_argument("--taxonomy", dest="taxa2", nargs='+', help="Enter the list of taxonomy files for all samples.")
+
+#########################################
+# SUBPARSER: HAMRONIZATION
+#########################################
+hamronization_parser = subparsers.add_parser('hamronization_taxa')
+
+hamronization_parser.add_argument("--hamronization", dest="arg", nargs='?', help="Enter the path to the hamronization_complete_summary.tsv \n (default: %(default)s)",
+                                  type=str, default='hamronization_complete_summary.tsv')
+hamronization_parser.add_argument("--taxonomy", dest="taxa3", nargs='+', help="Enter the list of taxonomy files for all samples.")
+
+#########################################
+# TAXONOMY
+#########################################
+def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
+    mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig'])
+    # remove the lineage column
+    mmseqs2_df.drop('lineage', axis=1, inplace=True)
+    # convert any classification that contains Eukaryota or root to NaN, as funcscan targets bacteria ONLY
+    for i, row in mmseqs2_df.iterrows():
+        lineage = str(row['mmseqs_lineage_contig'])
+        if 'Eukaryota' in lineage or 'root' in lineage:
+            mmseqs2_df.at[i, 'mmseqs_lineage_contig'] = np.nan
+    # insert the sample name in the first column according to the file basename
+    file_basename = os.path.basename(mmseqs_taxonomy)
+    filename = os.path.splitext(file_basename)[0]
+    mmseqs2_df.insert(0, 'sample_id', filename)
+    return mmseqs2_df
+
+#########################################
+# FUNCTION: AMPCOMBI
+#########################################
+def ampcombi_taxa(args):
+    merged_df = pd.DataFrame()
+
+    # assign input args to variables
+    ampcombi = args.amp
+    taxa_list = args.taxa1
+
+    # prepare the taxonomy files
+    taxa_df = pd.DataFrame()
+    # append the dfs to the taxonomy_files_combined
+    for file in taxa_list: # list of taxa files ['','']
+        df = reformat_mmseqs_taxonomy(file)
+        taxa_df = pd.concat([taxa_df, df])
+
+    # filter the tool df
+    tool_df = pd.read_csv(ampcombi, sep=',') # current ampcombi version is comma sep. 
CHANGE WITH VERSION 0.2.0
+    # make sure 1st and 2nd column have the same column labels
+    tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
+    tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)
+    # grab the real contig id in another column copy for merging
+    tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]
+
+    # merge rows from taxa to ampcombi_df based on substring match in sample_id
+    # grab the unique sample names from the taxonomy table
+    samples_taxa = taxa_df['sample_id'].unique()
+    # for every sampleID in taxadf merge the results
+    for sampleID in samples_taxa:
+        # subset ampcombi
+        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        # subset taxa
+        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
+        # merge
+        subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left')
+        # cleanup the table
+        columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y']
+        subset_df.drop(columnsremove, axis=1, inplace=True)
+        subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True)
+        # append to the combined df (DataFrame.append was removed in pandas 2.0, so use pd.concat)
+        merged_df = pd.concat([merged_df, subset_df], ignore_index=True)
+
+    # write to file
+    merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False)
+
+#########################################
+# FUNCTION: COMBGC
+#########################################
+def combgc_taxa(args):
+    merged_df = pd.DataFrame()
+
+    # assign input args to variables
+    combgc = args.bgc
+    taxa_list = args.taxa2
+
+    # prepare the taxonomy files
+    taxa_df = pd.DataFrame()
+    # append the dfs to the taxonomy_files_combined
+    for file in taxa_list: # list of taxa files ['','']
+        df = reformat_mmseqs_taxonomy(file)
+        taxa_df = pd.concat([taxa_df, df])
+
+    # filter the tool df
+    tool_df = pd.read_csv(combgc, sep='\t')
+    # make sure 1st and 2nd column have the same column labels
+    tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
+    tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)
+
+    # merge rows from taxa to combgc_df based on substring match in sample_id
+    # grab the unique sample names from the taxonomy table
+    samples_taxa = taxa_df['sample_id'].unique()
+    # for every sampleID in taxadf merge the results
+    for sampleID in samples_taxa:
+        # subset combgc
+        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        # subset taxa
+        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
+        # merge
+        subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id', right_on='contig_id', how='left')
+        # cleanup the table
+        columnsremove = ['sample_id_y']
+        subset_df.drop(columnsremove, axis=1, inplace=True)
+        subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True)
+        # append to the combined df
+        merged_df = pd.concat([merged_df, subset_df], ignore_index=True)
+
+    # write to file
+    merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False)
+
+#########################################
+# FUNCTION: HAMRONIZATION
+#########################################
+def hamronization_taxa(args):
+    merged_df = pd.DataFrame()
+
+    # assign input args to variables
+    hamronization = args.arg
+    taxa_list = args.taxa3
+
+    # prepare the taxonomy files
+    taxa_df = pd.DataFrame()
+    # append the dfs to the taxonomy_files_combined
+    for file in taxa_list: # list of taxa files ['','']
+        df = reformat_mmseqs_taxonomy(file)
+        taxa_df = 
pd.concat([taxa_df, df])
+
+    # filter the tool df
+    tool_df = pd.read_csv(hamronization, sep='\t')
+    # rename the columns
+    tool_df.rename(columns={'input_file_name':'sample_id', 'input_sequence_id':'contig_id'}, inplace=True)
+    # reorder the columns
+    new_order = ['sample_id', 'contig_id'] + [col for col in tool_df.columns if col not in ['sample_id', 'contig_id']]
+    tool_df = tool_df.reindex(columns=new_order)
+    # grab the real contig id in another column copy for merging
+    tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]
+
+    # merge rows from taxa to hamronization_df based on substring match in sample_id
+    # grab the unique sample names from the taxonomy table
+    samples_taxa = taxa_df['sample_id'].unique()
+    # for every sampleID in taxadf merge the results
+    for sampleID in samples_taxa:
+        # subset hamronization
+        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        # subset taxa
+        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
+        # merge
+        subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left')
+        # cleanup the table
+        columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y']
+        subset_df.drop(columnsremove, axis=1, inplace=True)
+        subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True)
+        # append to the combined df
+        merged_df = pd.concat([merged_df, subset_df], ignore_index=True)
+
+    # write to file
+    merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)
+
+#########################################
+# SUBPARSERS: DEFAULT
+#########################################
+ampcombi_parser.set_defaults(func=ampcombi_taxa)
+combgc_parser.set_defaults(func=combgc_taxa)
+hamronization_parser.set_defaults(func=hamronization_taxa)
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    args.func(args) # call the default function
diff --git a/conf/modules.config b/conf/modules.config
index 36400fe9..eb27c5a1 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -44,6 +44,56 @@ process {
         ext.prefix = { "${meta.id}.fa" }
     }
 
+    withName: MMSEQS_DATABASES {
+        publishDir = [
+            path: { "${params.outdir}/databases/mmseqs/" },
+            mode: params.publish_dir_mode,
+            enabled: params.save_databases,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+        ext.args = [
+            params.taxa_classification_mmseqs_databases_savetmp ? "" : "--remove-tmp-files",
+        ].join(' ').trim()
+    }
+
+    withName: MMSEQS_CREATEDB {
+        publishDir = [
+            path: { "${params.outdir}/databases/mmseqs/mmseqs_createdb/" },
+            mode: params.publish_dir_mode,
+            enabled: params.save_databases,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    withName: MMSEQS_TAXONOMY {
+        publishDir = [
+            path: { "${params.outdir}/databases/mmseqs/mmseqs_taxonomy/" },
+            mode: params.publish_dir_mode,
+            enabled: params.save_databases,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+        ext.args = [
+            params.taxa_classification_mmseqs_taxonomy_savetmp ? 
"" : "--remove-tmp-files", + "--search-type ${params.taxa_classification_mmseqs_taxonomy_searchtype}", + "--lca-ranks ${params.taxa_classification_mmseqs_taxonomy_lcaranks}", + "--tax-lineage ${params.taxa_classification_mmseqs_taxonomy_taxlineage}", + "-s ${params.taxa_classification_mmseqs_taxonomy_sensitivity}", + "--orf-filter-s ${params.taxa_classification_mmseqs_taxonomy_orffilters}", + "--lca-mode ${params.taxa_classification_mmseqs_taxonomy_lcamode}", + "--vote-mode ${params.taxa_classification_mmseqs_taxonomy_votemode}", + ].join(' ').trim() + } + + withName: MMSEQS_CREATETSV { + publishDir = [ + path: { "${params.outdir}/taxa_classification/mmseqs_createtsv/${meta.id}/" }, + mode: params.publish_dir_mode, + enabled: params.run_taxa_classification, + pattern: "*.tsv", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: PROKKA { publishDir = [ path: { "${params.outdir}/annotation/prokka/" }, @@ -459,6 +509,22 @@ process { } withName: HAMRONIZATION_SUMMARIZE { + publishDir = [ + path: { "${params.outdir}/reports/hamronization_summarize" }, + mode: params.publish_dir_mode, + saveAs: { (params.run_taxa_classification == false) ? it : null } + ] + } + + withName: MERGE_TAXONOMY_HAMRONIZATION { + publishDir = [ + path: { "${params.outdir}/reports/hamronization_summarize" }, + mode: params.publish_dir_mode, + saveAs: { _ -> null } // do not save the file + ] + } + + withName: ARG_TABIX_BGZIP { publishDir = [ path: { "${params.outdir}/reports/hamronization_summarize" }, mode: params.publish_dir_mode, @@ -477,7 +543,15 @@ process { ext.args = "--tooldict '${ext.tooldict}' --cutoff ${params.amp_ampcombi_cutoff}" } - withName: TABIX_BGZIP { + withName: MERGE_TAXONOMY_AMPCOMBI { + publishDir = [ + path: { "${params.outdir}/reports/ampcombi" }, + mode: params.publish_dir_mode, + saveAs: { _ -> null } // do not save the file + ] + } + + withName: AMP_TABIX_BGZIP { publishDir = [ path: { "${params.outdir}/reports/ampcombi" }, mode: params.publish_dir_mode, @@ -493,6 +567,22 @@ process { ] } + withName: MERGE_TAXONOMY_COMBGC { + publishDir = [ + path: { "${params.outdir}/reports/combgc" }, + mode: params.publish_dir_mode, + saveAs: { _ -> null } // do not save the file + ] + } + + withName: BGC_TABIX_BGZIP { + publishDir = [ + path: { "${params.outdir}/reports/combgc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: DRAMP_DOWNLOAD { publishDir = [ path: { "${params.outdir}/databases/dramp" }, diff --git a/conf/test.config b/conf/test.config index 79fd38be..9e95a491 100644 --- a/conf/test.config +++ b/conf/test.config @@ -16,17 +16,17 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '6.GB' + max_memory = '8.GB' max_time = '6.h' // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'prodigal' - run_arg_screening = true - arg_fargene_hmmmodel = 'class_a,class_b_1_2' + run_arg_screening = true + arg_fargene_hmmmodel = 'class_a,class_b_1_2' - run_amp_screening = true + run_amp_screening = true } diff --git a/conf/test_bgc.config b/conf/test_bgc.config index 17df755d..89228579 100644 --- a/conf/test_bgc.config +++ b/conf/test_bgc.config @@ -16,7 +16,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '6.GB' + max_memory = '8.GB' max_time = '6.h' // Input data diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 5f850139..34fdd49a 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -18,7 +18,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '6.GB' + max_memory = '8.GB' max_time = '6.h' // Input data diff --git a/conf/test_taxonomy.config b/conf/test_taxonomy.config new file mode 100644 index 00000000..ad477b3c --- /dev/null +++ b/conf/test_taxonomy.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+
+    Use as follows:
+        nextflow run nf-core/funcscan -profile test_taxonomy,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Taxonomic classification test profile'
+    config_profile_description = 'Minimal test dataset to check taxonomic classification workflow function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '8.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv'
+    bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'
+    amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'
+
+    run_taxa_classification = true
+    annotation_tool         = 'prodigal'
+
+    run_arg_screening      = true
+    arg_skip_deeparg       = true
+    arg_skip_amrfinderplus = true
+
+    run_amp_screening = true
+
+    run_bgc_screening = true
+    bgc_skip_deepbgc  = true
+}
diff --git a/docs/output.md b/docs/output.md
index 064c073d..f20d1cd2 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -8,9 +8,9 @@ The output of nf-core/funcscan provides reports for each of the functional group
 
 - antimicrobial peptides (tools: [Macrel](https://github.com/BigDataBiology/macrel), [AMPlify](https://github.com/bcgsc/AMPlify), [ampir](https://ampir.marine-omics.net), [hmmsearch](http://hmmer.org) – summarised by [AMPcombi](https://github.com/Darcy220606/AMPcombi))
 - biosynthetic gene clusters (tools: [antiSMASH](https://docs.antismash.secondarymetabolites.org), [DeepBGC](https://github.com/Merck/deepbgc), [GECCO](https://gecco.embl.de), [hmmsearch](http://hmmer.org) – summarised by [comBGC](#combgc))
 
-As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set.
+As a general workflow, we recommend first looking at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)) to get a general overview of what hits have been found across all the tools of each functional group. Afterwards, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also include the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `--taxa_classification_mmseqs_databases_savetmp` and `--taxa_classification_mmseqs_taxonomy_savetmp` flags are set.
 
-Similarly, all downloaded databases are saved (i.e. 
from [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [AMPcombi](https://github.com/Darcy220606/AMPcombi), and [RGI](https://github.com/arpcard/rgi)) into the output directory `/databases/` if the `--save_databases` flag was set.
+Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [RGI](https://github.com/arpcard/rgi), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/databases/` if the `--save_databases` flag was set.
 
 Furthermore, for reproducibility, versions of all software used in the run is presented in a [MultiQC](http://multiqc.info) report.
 
@@ -18,6 +18,8 @@ The directories listed below will be created in the results directory (specified
 
 ```console
 results/
+├── taxonomic_classification/
+|   └── mmseqs_createtsv/
 ├── annotation/
 |   ├── bakta/
 |   ├── prodigal
@@ -54,6 +56,10 @@ work/
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes prokaryotic sequence data through the following steps:
 
+Taxonomic classification of nucleotide sequences with:
+
+- [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) - for contig taxonomic classification using 2bLCA.
+
 ORF prediction and annotation with any of:
 
 - [Pyrodigal](#pyrodigal) (default) – for open reading frame prediction.
@@ -93,10 +99,25 @@ Output Summaries:
 
 ## Tool details
 
+### Taxonomic classification tool
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `taxonomic_classification/mmseqs_createtsv/`
+  - `<samplename>/`:
+    - `*.tsv`: tab-separated table containing the taxonomic lineage of every contig. When a contig cannot be classified according to the database, it is assigned in the 'lineage' column as 'no rank | unclassified'.
+- `reports/<workflow>/<workflow>_complete_summary_taxonomy.tsv.gz`: tab-separated table containing the concatenated results from the workflow summary tables along with the taxonomic classification, produced when the parameter `--run_taxa_classification` is supplied.
+
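+The `*.tsv` files have no header line; the bundled `merge_taxonomy.py` script parses their columns, in order, as `contig_id`, `taxid`, `rank_label`, `scientific_name`, `lineage`, and `mmseqs_lineage_contig`.
+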
+</details>
+
+[MMseqs2](https://github.com/soedinglab/MMseqs2) classifies the taxonomic lineage of contigs based on the last common ancestor. The inferred taxonomic lineages are included in the final workflow summaries to annotate the potential source bacteria of the identified AMPs, ARGs, and/or BGCs.
+
 ### Annotation tools
 
 [Pyrodigal](#pyrodigal), [Prodigal](#prodigal), [Prokka](#prokka), [Bakta](#bakta)
 
 #### Prodigal
 
@@ -403,16 +424,17 @@ Output Summaries:
 <summary>Output files</summary>
 
 - `ampcombi/`
-  - `ampcombi_complete_summary.csv.gz`: summarised output from all AMP workflow tools (except hmmer_hmmsearch) in compressed csv format
+  - `ampcombi_complete_summary.tsv`: tab-separated table containing the concatenated results from the AMPcombi summary tables. This is the output given when the taxonomic classification is not activated (pipeline default).
+  - `ampcombi_complete_summary_taxonomy.tsv.gz`: summarised output from all AMP workflow tools with taxonomic assignment in compressed tsv format.
   - `ampcombi.log`: a log file generated by ampcombi
-  - `*_ampcombi.csv`: summarised output in csv for each sample
-  - `*_amp.faa*`: fasta file containing the amino acid sequences for all AMP hits for each sample
-  - `*_diamond_matches.txt*`: alignment file generated by DIAMOND for each sample
+  - `<samplename>/*_ampcombi.csv`: summarised output in csv for each sample
+  - `<samplename>/*_amp.faa*`: fasta file containing the amino acid sequences for all AMP hits for each sample
+  - `<samplename>/*_diamond_matches.txt*`: alignment file generated by DIAMOND for each sample
 
 </details>
 
 <details markdown="1">
 <summary>AMP summary table header descriptions</summary>
 
 | Table column              | Description |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -463,9 +481,10 @@ Output Summaries:
 <details markdown="1">
 <summary>Output files</summary>
 
-- `hamronization/` one of the following:
+- `hamronization_summarize/` one of the following:
   - `hamronization_combined_report.json`: summarised output in .json format
-  - `hamronization_combined_report.tsv`: summarised output in .tsv format
+  - `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default).
+  - `hamronization_combined_report.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
   - `hamronization_combined_report.html`: interactive output in .html format
 
 </details>
@@ -521,7 +540,8 @@ Output Summaries:
 <summary>Output files</summary>
 
 - `comBGC/`
-  - `combgc_complete_summary.tsv`: summarised output from all BGC detection tools used in tsv format (all samples concatenated).
+  - `combgc_complete_summary.tsv`: summarised output from all BGC detection tools used in tsv format (all samples concatenated). This is the output given when the taxonomic classification is not activated (pipeline default).
+  - `combgc_complete_summary.tsv.gz`: gzipped tab-separated summary output from all BGC detection tools (all samples concatenated), including the taxonomic classification obtained when `--run_taxa_classification` is activated.
   - `*/combgc_summary.tsv`: summarised output from all applied BGC detection tools in tsv format for each sample.
 
 </details>
diff --git a/docs/usage.md b/docs/usage.md
index b5fb3dd7..27afb2ea 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -6,7 +6,7 @@
 
 ## Introduction
 
-nf-core/funcscan is a pipeline for efficient and parallelised screening of long nucleotide sequences such as contigs for antimicrobial peptide genes, antimicrobial resistance genes, and biosynthetic gene clusters.
+nf-core/funcscan is a pipeline for efficient and parallelised screening of long nucleotide sequences such as contigs for antimicrobial peptide genes, antimicrobial resistance genes, and biosynthetic gene clusters. It can additionally identify the taxonomic origin of the sequences.
 
 ## Running the pipeline
 
@@ -18,13 +18,14 @@ nextflow run nf-core/funcscan --input samplesheet.csv --outdir <OUTDIR> -profile
 
 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
 
-To run any of the three screening workflows (AMP, ARG, and/or BGC), switch them on by adding the respective flag(s) to the command:
+To run any of the three screening workflows (AMP, ARG, and/or BGC) or taxonomic classification, switch them on by adding the respective flag(s) to the command:
 
 - `--run_amp_screening`
 - `--run_arg_screening`
 - `--run_bgc_screening`
+- `--run_taxa_classification`
 
-When switched on, all tools of the given workflow will be run by default. If you don't need specific tools, you can explicitly skip them.
+When switched on, all tools of the given workflow will be run by default. If you don't need specific tools, you can explicitly skip them. For taxonomic classification, MMseqs2 is currently the only tool implemented in the pipeline.
 
 **Example:** You want to run AMP and ARG screening but you don't need the DeepARG tool of the ARG workflow and the Macrel tool of the AMP workflow. Your command would be:
 
@@ -68,10 +69,33 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p
 
 > ⚠️ We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters.
 
-## Notes on screening tools
+## Notes on screening tools and taxonomic classification
 
 The implementation of some tools in the pipeline may have some particular behaviours that you should be aware of before you run the pipeline.
 
+### MMseqs2
+
+MMseqs2 is currently the only taxonomic classification tool used in the pipeline to assign a taxonomic lineage to the input contigs. The database used to assign the taxonomic lineage can either be:
+
+- a custom database created by the user with `mmseqs createdb` externally and beforehand (see the sketch after this list). If `--taxa_classification_mmseqs_databases_localpath` is assigned, this database takes precedence over the default database set by `--taxa_classification_mmseqs_databases_id`.
+
+  ```bash
+  --taxa_classification_mmseqs_databases_localpath 'path/to/mmseqs_custom_database/dir'
+  ```
+
+- an MMseqs2 ready database. These databases were compiled by the developers of MMseqs2 and can be called using their labels. All available options can be found [here](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). Only use those databases that have taxonomy files available (i.e., Taxonomy == Yes). By default, MMseqs2 in the pipeline uses '[Kalamari](https://github.com/lskatz/Kalamari)' and runs an amino acid based alignment.
+
+  ```bash
+  --taxa_classification_mmseqs_databases_id 'Kalamari'
+  ```
+
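+A minimal sketch of preparing such a custom database externally (the FASTA file name and output directory below are placeholders; to be usable for taxonomic classification the database additionally needs taxonomy information attached, e.g. via `mmseqs createtaxdb` with the appropriate taxonomy dump and mapping files, see the MMseqs2 wiki for details):
+
+```bash
+mmseqs createdb my_reference_sequences.fasta path/to/mmseqs_custom_database/dir/db
+mmseqs createtaxdb path/to/mmseqs_custom_database/dir/db tmp/
+```
+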
 ### antiSMASH
 
 antiSMASH has a minimum contig parameter, in which only contigs of a certain length (or longer) will be screened. In cases where no hits are found in these, the tool ends successfully without hits. However if no contigs in an input file reach that minimum threshold, the tool will end with a 'failure' code, and cause the pipeline to crash.
diff --git a/modules.json b/modules.json
index 425884f2..b0e54968 100644
--- a/modules.json
+++ b/modules.json
@@ -136,6 +136,26 @@
       "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
       "installed_by": ["modules"]
     },
+    "mmseqs/createdb": {
+      "branch": "master",
+      "git_sha": "18a43d316b6fd683dc2346867b42882b99811cfd",
+      "installed_by": ["modules"]
+    },
+    "mmseqs/createtsv": {
+      "branch": "master",
+      "git_sha": "5d849d54f06174c3313eb50c776d4916912db16b",
+      "installed_by": ["modules"]
+    },
+    "mmseqs/databases": {
+      "branch": "master",
+      "git_sha": "151460db852d636979d9ff3ee631e2268060d4c3",
+      "installed_by": ["modules"]
+    },
+    "mmseqs/taxonomy": {
+      "branch": "master",
+      "git_sha": "8455be677998258bf40ab3be550c6a96f456cc23",
+      "installed_by": ["modules"]
+    },
     "multiqc": {
       "branch": "master",
       "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
diff --git a/modules/local/merge_taxonomy_ampcombi.nf b/modules/local/merge_taxonomy_ampcombi.nf
new file mode 100644
index 00000000..26e38343
--- /dev/null
+++ b/modules/local/merge_taxonomy_ampcombi.nf
@@ -0,0 +1,32 @@
+process MERGE_TAXONOMY_AMPCOMBI {
+    label 'process_medium'
+
+    conda "conda-forge::python=3.11.0 conda-forge::biopython=1.80 conda-forge::pandas=1.5.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' :
+        'biocontainers/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' }"
+
+    input:
+    path(ampcombi_df)
+    path(taxa_list)
+
+    output:
+    path "ampcombi_complete_summary_taxonomy.tsv" , emit: tsv
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script: // This script is bundled with the pipeline, in nf-core/funcscan/bin/
+    """
+    merge_taxonomy.py \\
+        ampcombi_taxa \\
+        --ampcombi $ampcombi_df \\
+        --taxonomy $taxa_list
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        merge_taxonomy: \$(merge_taxonomy.py --version | sed 's/merge_taxonomy //g')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/merge_taxonomy_combgc.nf b/modules/local/merge_taxonomy_combgc.nf
new file mode 100644
index 00000000..075668f2
--- /dev/null
+++ b/modules/local/merge_taxonomy_combgc.nf
@@ -0,0 +1,32 @@
+process MERGE_TAXONOMY_COMBGC {
+    label 'process_medium'
+
+    conda "conda-forge::python=3.11.0 conda-forge::biopython=1.80 conda-forge::pandas=1.5.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' : + 'biocontainers/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' }" + + input: + path(combgc_df) + path(taxa_list) + + output: + path "combgc_complete_summary_taxonomy.tsv" , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in nf-core/funcscan/bin/ + """ + merge_taxonomy.py \\ + combgc_taxa \\ + --combgc $combgc_df \\ + --taxonomy $taxa_list + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + merge_taxonomy: \$(merge_taxonomy.py --version | sed 's/merge_taxonomy //g') + END_VERSIONS + """ +} diff --git a/modules/local/merge_taxonomy_hamronization.nf b/modules/local/merge_taxonomy_hamronization.nf new file mode 100644 index 00000000..14b85ff2 --- /dev/null +++ b/modules/local/merge_taxonomy_hamronization.nf @@ -0,0 +1,32 @@ +process MERGE_TAXONOMY_HAMRONIZATION { + label 'process_medium' + + conda "conda-forge::python=3.11.0 conda-forge::biopython=1.80 conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' : + 'biocontainers/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' }" + + input: + path(hamronization_df) + path(taxa_list) + + output: + path "hamronization_complete_summary_taxonomy.tsv" , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in nf-core/funcscan/bin/ + """ + merge_taxonomy.py \\ + hamronization_taxa \\ + --hamronization $hamronization_df \\ + --taxonomy $taxa_list + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + merge_taxonomy: \$(merge_taxonomy.py --version | sed 's/merge_taxonomy //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mmseqs/createdb/environment.yml b/modules/nf-core/mmseqs/createdb/environment.yml new file mode 100644 index 00000000..77b28f59 --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/environment.yml @@ -0,0 +1,7 @@ +name: mmseqs_createdb +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::mmseqs2=15.6f452 diff --git a/modules/nf-core/mmseqs/createdb/main.nf b/modules/nf-core/mmseqs/createdb/main.nf new file mode 100644 index 00000000..9487e5bc --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/main.nf @@ -0,0 +1,65 @@ +process MMSEQS_CREATEDB { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0': + 'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }" + + input: + tuple val(meta), path(sequence) + + output: + tuple val(meta), path("${prefix}/"), emit: db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = sequence.getExtension() == "gz" ? true : false + def sequence_name = is_compressed ? 
sequence.getBaseName() : sequence + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${sequence} > ${sequence_name} + fi + + mkdir -p ${prefix} + + mmseqs \\ + createdb \\ + ${sequence_name} \\ + ${prefix}/${prefix} \\ + $args \\ + --compressed 1 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p ${prefix} + + touch ${prefix}/${prefix} + touch ${prefix}/${prefix}.dbtype + touch ${prefix}/${prefix}.index + touch ${prefix}/${prefix}.lookup + touch ${prefix}/${prefix}.source + touch ${prefix}/${prefix}_h + touch ${prefix}/${prefix}_h.dbtype + touch ${prefix}/${prefix}_h.index + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mmseqs/createdb/meta.yml b/modules/nf-core/mmseqs/createdb/meta.yml new file mode 100644 index 00000000..a011020b --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/meta.yml @@ -0,0 +1,47 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "mmseqs_createdb" +description: Create an MMseqs database from an existing FASTA/Q file +keywords: + - protein sequence + - databases + - clustering + - searching + - indexing + - mmseqs2 +tools: + - "mmseqs": + description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite" + homepage: "https://github.com/soedinglab/MMseqs2" + documentation: "https://mmseqs.com/latest/userguide.pdf" + tool_dev_url: "https://github.com/soedinglab/MMseqs2" + doi: "10.1093/bioinformatics/btw006" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - sequence: + type: file + description: Input sequences in FASTA/Q (zipped or unzipped) format to parse into an mmseqs database + pattern: "*.{fasta,fasta.gz,fa,fa.gz,fna,fna.gz,fastq,fastq.gz,fq,fq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - db: + type: directory + description: The created MMseqs2 database + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Joon-Klaps" +maintainers: + - "@Joon-Klaps" + - "@vagkaratzas" diff --git a/modules/nf-core/mmseqs/createdb/tests/main.nf.test b/modules/nf-core/mmseqs/createdb/tests/main.nf.test new file mode 100644 index 00000000..60d73419 --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/tests/main.nf.test @@ -0,0 +1,58 @@ +nextflow_process { + + name "Test Process MMSEQS_CREATEDB" + script "../main.nf" + process "MMSEQS_CREATEDB" + tag "modules" + tag "modules_nfcore" + tag "mmseqs" + tag "mmseqs/createdb" + + test("Should build an mmseqs db from a contigs fasta file") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.db).match("contig_db") }, + { assert process.out.versions } + ) + } + + } + + test("Should build an mmseqs db from a zipped amino acid sequence file") { + + when { + process { + """ + + input[0] = [ + [ id:'test' ], + file(params.test_data['sarscov2']['genome']['proteome_fasta_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.db).match("protein_gz_db") }, + { assert process.out.versions } + ) + } + + } + +} diff --git a/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap b/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap new file mode 100644 index 00000000..0c600c1f --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "protein_gz_db": { + "content": [ + [ + [ + { + "id": "test" + }, + [ + "test:md5,4b494965ed7ab67da8ca3f39523eb104", + "test.dbtype:md5,152afd7bf4dbe26f85032eee0269201a", + "test.index:md5,46f9d884e9a7f442fe1cd2ce339734e3", + "test.lookup:md5,3e27cb93d9ee875ad42a6f32f5651bdc", + "test.source:md5,eaa64fc8a5f7ec1ee49b0dcbd1a72e9d", + "test_h:md5,6e798b81c70d191f78939c2dd6223a7f", + "test_h.dbtype:md5,8895d3d8e9322aedbf45249dfb3ddb0a", + "test_h.index:md5,d5ac49ff56df064b980fa0eb5da57673" + ] + ] + ] + ], + "timestamp": "2023-11-21T12:10:12.018974702" + }, + "contig_db": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test:md5,7c3c2c5926cf8fa82e66b9628f680256", + "test.dbtype:md5,c8ed20c23ba91f4577f84c940c86c7db", + "test.index:md5,5b2fd8abd0ad3fee24738af7082e6a6e", + "test.lookup:md5,32f88756dbcb6aaf7b239b0d61730f1b", + "test.source:md5,9ada5b3ea6e1a7e16c4418eb98ae8d9d", + "test_h:md5,8c29f5ed94d83d7115e9c8a883ce358d", + "test_h.dbtype:md5,8895d3d8e9322aedbf45249dfb3ddb0a", + "test_h.index:md5,87c7c8c6d16018ebfaa6f408391a5ae2" + ] + ] + ] + ], + "timestamp": "2023-11-21T12:10:04.7348329" + } +} \ No newline at end of file diff --git a/modules/nf-core/mmseqs/createdb/tests/tags.yml b/modules/nf-core/mmseqs/createdb/tests/tags.yml new file mode 100644 index 00000000..1f511ab0 --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/tests/tags.yml @@ -0,0 +1,2 @@ +mmseqs/createdb: + - modules/nf-core/mmseqs/createdb/** diff --git a/modules/nf-core/mmseqs/createtsv/environment.yml b/modules/nf-core/mmseqs/createtsv/environment.yml new file mode 100644 index 00000000..4840fc02 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/environment.yml @@ -0,0 +1,7 @@ +name: 
mmseqs_createtsv
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::mmseqs2=15.6f452
diff --git a/modules/nf-core/mmseqs/createtsv/main.nf b/modules/nf-core/mmseqs/createtsv/main.nf
new file mode 100644
index 00000000..dcd4c13d
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/main.nf
@@ -0,0 +1,63 @@
+
+process MMSEQS_CREATETSV {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0':
+        'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }"
+
+    input:
+    tuple val(meta), path(db_result)
+    tuple val(meta2), path(db_query)
+    tuple val(meta3), path(db_target)
+
+    output:
+    tuple val(meta), path("*.tsv"), emit: tsv
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: "*.dbtype" // database generated by mmseqs cluster | search | taxonomy | ...
+    def args3 = task.ext.args3 ?: "*.dbtype" // database generated by mmseqs/createdb
+    def args4 = task.ext.args4 ?: "*.dbtype" // database generated by mmseqs/createdb
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    # Extract files with specified args based suffix | remove suffix | isolate longest common substring of files
+    DB_RESULT_PATH_NAME=\$(find -L "$db_result/" -maxdepth 1 -name "$args2" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' )
+    DB_QUERY_PATH_NAME=\$(find -L "$db_query/" -maxdepth 1 -name "$args3" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' )
+    DB_TARGET_PATH_NAME=\$(find -L "$db_target/" -maxdepth 1 -name "$args4" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' )
+
+    mmseqs \\
+        createtsv \\
+        \$DB_QUERY_PATH_NAME \\
+        \$DB_TARGET_PATH_NAME \\
+        \$DB_RESULT_PATH_NAME \\
+        ${prefix}.tsv \\
+        $args \\
+        --threads ${task.cpus} \\
+        --compressed 1
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/mmseqs/createtsv/meta.yml b/modules/nf-core/mmseqs/createtsv/meta.yml
new file mode 100644
index 00000000..e85b066f
--- /dev/null
+++ b/modules/nf-core/mmseqs/createtsv/meta.yml
@@ -0,0 +1,65 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "mmseqs_createtsv"
+description: Create a tsv file from a query and a target database as well as the result database
+keywords:
+  - protein sequence
+  - databases
+  - clustering
+  - searching
+  - indexing
+  - mmseqs2
+  - tsv
+tools:
+  - "mmseqs":
+      description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite"
+      homepage: "https://github.com/soedinglab/MMseqs2"
+      documentation: "https://mmseqs.com/latest/userguide.pdf"
+      tool_dev_url: "https://github.com/soedinglab/MMseqs2"
+      doi: "10.1093/bioinformatics/btw006"
+      licence: ["GPL v3"]
+input:
+  # Only when we have meta
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. 
`[ id:'test', single_end:false ]` + - db_result: + type: directory + description: an MMseqs2 database with result data + - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - db_query: + type: directory + description: an MMseqs2 database with query data + - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - db_target: + type: directory + description: an MMseqs2 database with target data +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - tsv: + type: file + description: The resulting tsv file created using the query, target and result MMseqs databases + pattern: "*.{tsv}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Joon-Klaps" +maintainers: + - "@Joon-Klaps" diff --git a/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config b/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config new file mode 100644 index 00000000..48fee164 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config @@ -0,0 +1,6 @@ +process { + + withName: MMSEQS_CREATETSV { + ext.args2 = '*_clu.dbtype' + } +} diff --git a/modules/nf-core/mmseqs/createtsv/tests/main.nf.test b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test new file mode 100644 index 00000000..99e79e0c --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test @@ -0,0 +1,248 @@ +nextflow_process { + + name "Test Process MMSEQS_CREATETSV" + script "../main.nf" + process "MMSEQS_CREATETSV" + + tag "modules" + tag "modules_nfcore" + tag "mmseqs" + tag "mmseqs/taxonomy" + tag "mmseqs/createdb" + tag "mmseqs/databases" + tag "untar" + tag "mmseqs/createtsv" + + test("mmseqs/createtsv - bacteroides_fragilis - taxonomy") { + + config "./taxonomy.nextflow.config" + + setup { + run("MMSEQS_CREATEDB", alias: "MMSEQS_TAXA") { + script "../../createdb/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:false ], + file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + ] + """ + } + } + run("MMSEQS_DATABASES") { + script "../../databases/main.nf" + process { + """ + input[0] = 'SILVA' + """ + } + } + run("MMSEQS_TAXONOMY") { + script "../../taxonomy/main.nf" + process { + """ + input[0] = MMSEQS_TAXA.out.db + input[1] = MMSEQS_DATABASES.out.database + """ + } + } + } + when { + process { + """ + input[0] = MMSEQS_TAXONOMY.out.db_taxonomy + input[1] = [[:],[]] + input[2] = MMSEQS_TAXA.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("mmseqs/createtsv - sarscov2 - cluster") { + + config "./cluster.nextflow.config" + + setup { + run("UNTAR", alias: "UNTAR_QUERY") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + run("UNTAR", alias: "UNTAR_TARGET") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_target', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + run("UNTAR", alias: "UNTAR_RESULT") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ 
id:'test_result', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + } + + when { + + process { + """ + ch_query = UNTAR_QUERY.out.untar + ch_target = UNTAR_TARGET.out.untar + ch_result = UNTAR_RESULT.out.untar + + input[0] = ch_result + input[1] = ch_query + input[2] = ch_target + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.tsv).match("tsv") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("mmseqs/createtsv - bacteroides_fragilis - taxonomy - stub") { + + options "-stub" + config "./taxonomy.nextflow.config" + + setup { + run("MMSEQS_CREATEDB", alias: "MMSEQS_TAXA") { + script "../../createdb/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:false ], + file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + ] + """ + } + } + run("MMSEQS_DATABASES") { + script "../../databases/main.nf" + process { + """ + input[0] = 'SILVA' + """ + } + } + run("MMSEQS_TAXONOMY") { + script "../../taxonomy/main.nf" + process { + """ + input[0] = MMSEQS_TAXA.out.db + input[1] = MMSEQS_DATABASES.out.database + """ + } + } + } + when { + process { + """ + input[0] = MMSEQS_TAXONOMY.out.db_taxonomy + input[1] = [[:],[]] + input[2] = MMSEQS_TAXA.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("mmseqs/createtsv - sarscov2 - cluster - stub") { + + options "-stub" + config "./cluster.nextflow.config" + + setup { + run("UNTAR", alias: "UNTAR_QUERY") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + run("UNTAR", alias: "UNTAR_TARGET") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_target', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + run("UNTAR", alias: "UNTAR_RESULT") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_result', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + } + + when { + + process { + """ + ch_query = UNTAR_QUERY.out.untar + ch_target = UNTAR_TARGET.out.untar + ch_result = UNTAR_RESULT.out.untar + + input[0] = ch_result + input[1] = ch_query + input[2] = ch_target + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap new file mode 100644 index 00000000..5b8f9569 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap @@ -0,0 +1,137 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:53:02.392516336" + }, + "tsv": { + "content": [ + [ + [ + { + "id": "test_result", + "single_end": true + }, + "test_result.tsv:md5,4e7ba50ce2879660dc6595286bf0d097" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:53:02.311022721" + }, + "mmseqs/createtsv - bacteroides_fragilis - 
taxonomy - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_query", + "single_end": false + }, + "test_query.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ], + "tsv": [ + [ + { + "id": "test_query", + "single_end": false + }, + "test_query.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:53:11.715695614" + }, + "mmseqs/createtsv - sarscov2 - cluster - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_result", + "single_end": true + }, + "test_result.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ], + "tsv": [ + [ + { + "id": "test_result", + "single_end": true + }, + "test_result.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:53:21.386186911" + }, + "mmseqs/createtsv - bacteroides_fragilis - taxonomy": { + "content": [ + { + "0": [ + [ + { + "id": "test_query", + "single_end": false + }, + "test_query.tsv:md5,9179f5c85b8b87a4dc998c9d17840161" + ] + ], + "1": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ], + "tsv": [ + [ + { + "id": "test_query", + "single_end": false + }, + "test_query.tsv:md5,9179f5c85b8b87a4dc998c9d17840161" + ] + ], + "versions": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:52:51.792345007" + } +} diff --git a/modules/nf-core/mmseqs/createtsv/tests/tags.yml b/modules/nf-core/mmseqs/createtsv/tests/tags.yml new file mode 100644 index 00000000..e27827f5 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/tags.yml @@ -0,0 +1,2 @@ +mmseqs/createtsv: + - "modules/nf-core/mmseqs/createtsv/**" diff --git a/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config b/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config new file mode 100644 index 00000000..f08205d1 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: MMSEQS_TAXONOMY { + ext.args = '--search-type 2' + } + +} diff --git a/modules/nf-core/mmseqs/databases/environment.yml b/modules/nf-core/mmseqs/databases/environment.yml new file mode 100644 index 00000000..3bf8437d --- /dev/null +++ b/modules/nf-core/mmseqs/databases/environment.yml @@ -0,0 +1,7 @@ +name: mmseqs_databases +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::mmseqs2=15.6f452 diff --git a/modules/nf-core/mmseqs/databases/main.nf b/modules/nf-core/mmseqs/databases/main.nf new file mode 100644 index 00000000..3e228b29 --- /dev/null +++ b/modules/nf-core/mmseqs/databases/main.nf @@ -0,0 +1,62 @@ +process MMSEQS_DATABASES { + tag "${database}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0':
+        'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }"
+
+    input:
+    val database
+
+    output:
+    path "${prefix}/"   , emit: database
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: 'mmseqs_database'
+    """
+    mkdir ${prefix}/
+
+    mmseqs databases \\
+        ${database} \\
+        ${prefix}/database \\
+        tmp/ \\
+        --threads ${task.cpus} \\
+        --compressed 1 \\
+        ${args}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+    END_VERSIONS
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: 'mmseqs_database'
+    """
+    mkdir ${prefix}/
+
+    touch ${prefix}/database
+    touch ${prefix}/database.dbtype
+    touch ${prefix}/database_h
+    touch ${prefix}/database_h.dbtype
+    touch ${prefix}/database_h.index
+    touch ${prefix}/database.index
+    touch ${prefix}/database.lookup
+    touch ${prefix}/database_mapping
+    touch ${prefix}/database.source
+    touch ${prefix}/database_taxonomy
+    touch ${prefix}/database.version
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/mmseqs/databases/meta.yml b/modules/nf-core/mmseqs/databases/meta.yml
new file mode 100644
index 00000000..803a87f6
--- /dev/null
+++ b/modules/nf-core/mmseqs/databases/meta.yml
@@ -0,0 +1,33 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "mmseqs_databases"
+description: Download an MMseqs2-formatted database
+keywords:
+  - database
+  - indexing
+  - clustering
+  - searching
+tools:
+  - "mmseqs":
+      description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite"
+      homepage: "https://github.com/soedinglab/MMseqs2"
+      documentation: "https://mmseqs.com/latest/userguide.pdf"
+      tool_dev_url: "https://github.com/soedinglab/MMseqs2"
+      doi: "10.1093/bioinformatics/btw006"
+      licence: ["GPL v3"]
+input:
+  - database:
+      type: string
+      description: Database available through the mmseqs2 databases interface - see https://github.com/soedinglab/MMseqs2/wiki#downloading-databases for details
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - database:
+      type: directory
+      description: Directory containing the processed MMseqs2 database
+authors:
+  - "@prototaxites"
+maintainers:
+  - "@prototaxites"
diff --git a/modules/nf-core/mmseqs/taxonomy/environment.yml b/modules/nf-core/mmseqs/taxonomy/environment.yml
new file mode 100644
index 00000000..fa40c277
--- /dev/null
+++ b/modules/nf-core/mmseqs/taxonomy/environment.yml
@@ -0,0 +1,9 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+name: "mmseqs_taxonomy"
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - "bioconda::mmseqs2=15.6f452"
diff --git a/modules/nf-core/mmseqs/taxonomy/main.nf b/modules/nf-core/mmseqs/taxonomy/main.nf
new file mode 100644
index 00000000..54849885
--- /dev/null
+++ b/modules/nf-core/mmseqs/taxonomy/main.nf
@@ -0,0 +1,65 @@
+process MMSEQS_TAXONOMY {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0': + 'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }" + + input: + tuple val(meta), path(db_query) + path(db_target) + + output: + tuple val(meta), path("${prefix}_taxonomy"), emit: db_taxonomy + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: "*.dbtype" //represents the db_query + def args3 = task.ext.args3 ?: "*.dbtype" //represents the db_target + prefix = task.ext.prefix ?: "${meta.id}" + + """ + mkdir -p ${prefix}_taxonomy + + # Extract files with specified args based suffix | remove suffix | isolate longest common substring of files + DB_QUERY_PATH_NAME=\$(find -L "${db_query}/" -maxdepth 1 -name "${args2}" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + DB_TARGET_PATH_NAME=\$(find -L "${db_target}/" -maxdepth 1 -name "${args3}" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + + mmseqs \\ + taxonomy \\ + \$DB_QUERY_PATH_NAME \\ + \$DB_TARGET_PATH_NAME \\ + ${prefix}_taxonomy/${prefix} \\ + tmp1 \\ + $args \\ + --threads ${task.cpus} \\ + --compressed 1 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + + """ + mkdir -p ${prefix}_taxonomy + touch ${prefix}_taxonomy/${prefix}.{0..25} + touch ${prefix}_taxonomy/${prefix}.dbtype + touch ${prefix}_taxonomy/${prefix}.index + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mmseqs/taxonomy/meta.yml b/modules/nf-core/mmseqs/taxonomy/meta.yml new file mode 100644 index 00000000..d836029c --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/meta.yml @@ -0,0 +1,48 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "mmseqs_taxonomy" +description: Computes the lowest common ancestor by identifying the query sequence homologs against the target database. +keywords: + - protein sequence + - nucleotide sequence + - databases + - taxonomy + - homologs + - mmseqs2 +tools: + - "mmseqs": + description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite" + homepage: "https://github.com/soedinglab/MMseqs2" + documentation: "https://mmseqs.com/latest/userguide.pdf" + tool_dev_url: "https://github.com/soedinglab/MMseqs2" + doi: "10.1093/bioinformatics/btw006" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - db_query: + type: directory + description: An MMseqs2 database with query data + - db_target: + type: directory + description: an MMseqs2 database with target data including the taxonomy classification +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - db_taxonomy: + type: directory + description: An MMseqs2 database with target data including the taxonomy classification + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@darcy220606" +maintainers: + - "@darcy220606" diff --git a/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test b/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test new file mode 100644 index 00000000..90b356ae --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test @@ -0,0 +1,81 @@ +nextflow_process { + + name "Test Process MMSEQS_TAXONOMY" + script "../main.nf" + config "./nextflow.config" + process "MMSEQS_TAXONOMY" + + tag "modules" + tag "modules_nfcore" + tag "mmseqs" + tag "mmseqs/taxonomy" + tag "mmseqs/createdb" + tag "mmseqs/databases" + + setup { + run("MMSEQS_CREATEDB") { + script "modules/nf-core/mmseqs/createdb/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:false ], + file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + ] + """ + } + } + + run("MMSEQS_DATABASES") { + script "modules/nf-core/mmseqs/databases/main.nf" + process { + """ + input[0] = 'SILVA' + """ + } + } + } + + test("mmseqs/taxonomy - bacteroides_fragilis - genome_nt") { + when { + process { + """ + input[0] = MMSEQS_CREATEDB.out.db + input[1] = MMSEQS_DATABASES.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.db_taxonomy.get(0).get(1)).list().sort() ).match()}, + { assert process.out.versions } + ) + } + } + + test("mmseqs/taxonomy - bacteroides_fragilis - genome_nt - stub") { + + options "-stub" + + when { + process { + """ + input[0] = MMSEQS_CREATEDB.out.db + input[1] = MMSEQS_DATABASES.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.db_taxonomy.get(0).get(1)).list().sort() ).match()}, + { assert process.out.versions } + ) + } + } +} + diff --git a/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test.snap b/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test.snap new file mode 100644 index 00000000..d1b57c05 --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test.snap @@ -0,0 +1,44 @@ +{ + "mmseqs/taxonomy - bacteroides_fragilis - genome_nt": { + "content": [ + "test_query.0", + "test_query.1", + "test_query.dbtype", + "test_query.index" + ], + "timestamp": "2024-02-26T16:35:10.953102408" + }, + "mmseqs/taxonomy - bacteroides_fragilis - genome_nt - stub": { + "content": [ + "test_query.0", + "test_query.1", + "test_query.10", + "test_query.11", + "test_query.12", + "test_query.13", + "test_query.14", + "test_query.15", + "test_query.16", + "test_query.17", + "test_query.18", + "test_query.19", + "test_query.2", + "test_query.20", + "test_query.21", + "test_query.22", + "test_query.23", + "test_query.24", + "test_query.25", + "test_query.3", + "test_query.4", + "test_query.5", + "test_query.6", + "test_query.7", + "test_query.8", + "test_query.9", + "test_query.dbtype", + "test_query.index" + ], + "timestamp": "2024-02-26T16:35:20.111282029" + } +} \ No newline at end of file diff --git a/modules/nf-core/mmseqs/taxonomy/tests/nextflow.config b/modules/nf-core/mmseqs/taxonomy/tests/nextflow.config new file mode 100644 index 00000000..72f6fc81 --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: MMSEQS_TAXONOMY { + ext.args = '--search-type 2' 
+ } +} diff --git a/modules/nf-core/mmseqs/taxonomy/tests/tags.yml b/modules/nf-core/mmseqs/taxonomy/tests/tags.yml new file mode 100644 index 00000000..76172197 --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/tests/tags.yml @@ -0,0 +1,2 @@ +mmseqs/taxonomy: + - "modules/nf-core/mmseqs/taxonomy/**" diff --git a/nextflow.config b/nextflow.config index a3e86802..88d2f7db 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,205 +10,222 @@ params { // Input options - input = null + input = null + + // Taxonomy classification options + run_taxa_classification = false + taxa_classification_tool = 'mmseqs2' + + taxa_classification_mmseqs_databases_localpath = null + taxa_classification_mmseqs_databases_id = 'Kalamari' + taxa_classification_mmseqs_databases_savetmp = false + + taxa_classification_mmseqs_taxonomy_savetmp = false + taxa_classification_mmseqs_taxonomy_searchtype = 2 + taxa_classification_mmseqs_taxonomy_lcaranks = 'kingdom,phylum,class,order,family,genus,species' + taxa_classification_mmseqs_taxonomy_taxlineage = 1 + taxa_classification_mmseqs_taxonomy_sensitivity = '5.0' + taxa_classification_mmseqs_taxonomy_orffilters = '2.0' + taxa_classification_mmseqs_taxonomy_lcamode = 3 + taxa_classification_mmseqs_taxonomy_votemode = 1 // Annotation options - annotation_tool = 'pyrodigal' - save_annotations = false - - annotation_prodigal_singlemode = false - annotation_prodigal_closed = false - annotation_prodigal_transtable = 11 - annotation_prodigal_forcenonsd = false - - annotation_pyrodigal_singlemode = false - annotation_pyrodigal_closed = false - annotation_pyrodigal_transtable = 11 - annotation_pyrodigal_forcenonsd = false - - annotation_bakta_db_localpath = null - annotation_bakta_db_downloadtype = 'full' - annotation_bakta_singlemode = false - annotation_bakta_mincontiglen = 1 - annotation_bakta_translationtable = 11 - annotation_bakta_gram = '?' - annotation_bakta_complete = false - annotation_bakta_renamecontigheaders = false - annotation_bakta_compliant = false - annotation_bakta_trna = false - annotation_bakta_tmrna = false - annotation_bakta_rrna = false - annotation_bakta_ncrna = false - annotation_bakta_ncrnaregion = false - annotation_bakta_crispr = false - annotation_bakta_skipcds = false - annotation_bakta_pseudo = false - annotation_bakta_skipsorf = false - annotation_bakta_gap = false - annotation_bakta_ori = false - annotation_bakta_activate_plot = false - - annotation_prokka_singlemode = false - annotation_prokka_rawproduct = false - annotation_prokka_kingdom = 'Bacteria' - annotation_prokka_gcode = 11 - annotation_prokka_cdsrnaolap = false - annotation_prokka_rnammer = false - annotation_prokka_mincontiglen = 1 - annotation_prokka_evalue = 0.000001 - annotation_prokka_coverage = 80 - annotation_prokka_compliant = true - annotation_prokka_addgenes = false - annotation_prokka_retaincontigheaders = false + annotation_tool = 'pyrodigal' + save_annotations = false + + annotation_prodigal_singlemode = false + annotation_prodigal_closed = false + annotation_prodigal_transtable = 11 + annotation_prodigal_forcenonsd = false + + annotation_pyrodigal_singlemode = false + annotation_pyrodigal_closed = false + annotation_pyrodigal_transtable = 11 + annotation_pyrodigal_forcenonsd = false + + annotation_bakta_db_localpath = null + annotation_bakta_db_downloadtype = 'full' + annotation_bakta_singlemode = false + annotation_bakta_mincontiglen = 1 + annotation_bakta_translationtable = 11 + annotation_bakta_gram = '?' 
+ annotation_bakta_complete = false + annotation_bakta_renamecontigheaders = false + annotation_bakta_compliant = false + annotation_bakta_trna = false + annotation_bakta_tmrna = false + annotation_bakta_rrna = false + annotation_bakta_ncrna = false + annotation_bakta_ncrnaregion = false + annotation_bakta_crispr = false + annotation_bakta_skipcds = false + annotation_bakta_pseudo = false + annotation_bakta_skipsorf = false + annotation_bakta_gap = false + annotation_bakta_ori = false + annotation_bakta_activate_plot = false + + annotation_prokka_singlemode = false + annotation_prokka_rawproduct = false + annotation_prokka_kingdom = 'Bacteria' + annotation_prokka_gcode = 11 + annotation_prokka_cdsrnaolap = false + annotation_prokka_rnammer = false + annotation_prokka_mincontiglen = 1 + annotation_prokka_evalue = 0.000001 + annotation_prokka_coverage = 80 + annotation_prokka_compliant = true + annotation_prokka_addgenes = false + annotation_prokka_retaincontigheaders = false // Database downloading options - save_databases = false + save_databases = false // AMP options - run_amp_screening = false + run_amp_screening = false - amp_skip_amplify = false + amp_skip_amplify = false - amp_skip_macrel = false + amp_skip_macrel = false - amp_skip_ampir = false - amp_ampir_model = 'precursor' - amp_ampir_minlength = 10 + amp_skip_ampir = false + amp_ampir_model = 'precursor' + amp_ampir_minlength = 10 - amp_skip_hmmsearch = false - amp_hmmsearch_models = null - amp_hmmsearch_savealignments = false - amp_hmmsearch_savetargets = false - amp_hmmsearch_savedomains = false + amp_skip_hmmsearch = false + amp_hmmsearch_models = null + amp_hmmsearch_savealignments = false + amp_hmmsearch_savetargets = false + amp_hmmsearch_savedomains = false - amp_ampcombi_db = null - amp_ampcombi_cutoff = 0 + amp_ampcombi_db = null + amp_ampcombi_cutoff = 0 // ARG options - run_arg_screening = false - - arg_skip_fargene = false - arg_fargene_hmmmodel = 'class_a,class_b_1_2,class_b_3,class_c,class_d_1,class_d_2,qnr,tet_efflux,tet_rpg,tet_enzyme' - arg_fargene_savetmpfiles = false - arg_fargene_minorflength = 90 - arg_fargene_score = null - arg_fargene_translationformat = 'pearson' - arg_fargene_orffinder = false - - arg_skip_rgi = false - arg_rgi_savejson = false - arg_rgi_savetmpfiles = false - arg_rgi_alignmenttool = 'BLAST' - arg_rgi_includeloose = false - arg_rgi_includenudge = false - arg_rgi_lowquality = false - arg_rgi_data = 'NA' - arg_rgi_split_prodigal_jobs = true - - arg_skip_amrfinderplus = false - arg_amrfinderplus_db = null - arg_amrfinderplus_identmin = -1 - arg_amrfinderplus_coveragemin = 0.5 - arg_amrfinderplus_translationtable = 11 - arg_amrfinderplus_plus = false - arg_amrfinderplus_name = false - - arg_skip_deeparg = false - arg_deeparg_data = null - arg_deeparg_data_version = 2 // Make sure to update on module version bump! 
- arg_deeparg_model = 'LS' - arg_deeparg_minprob = 0.8 - arg_deeparg_alignmentidentity = 50 - arg_deeparg_alignmentevalue = 1e-10 - arg_deeparg_alignmentoverlap = 0.8 - arg_deeparg_numalignmentsperentry = 1000 - - arg_skip_abricate = false - arg_abricate_db = 'ncbi' - arg_abricate_minid = 80 - arg_abricate_mincov = 80 - - arg_hamronization_summarizeformat = 'tsv' + run_arg_screening = false + + arg_skip_fargene = false + arg_fargene_hmmmodel = 'class_a,class_b_1_2,class_b_3,class_c,class_d_1,class_d_2,qnr,tet_efflux,tet_rpg,tet_enzyme' + arg_fargene_savetmpfiles = false + arg_fargene_minorflength = 90 + arg_fargene_score = null + arg_fargene_translationformat = 'pearson' + arg_fargene_orffinder = false + + arg_skip_rgi = false + arg_rgi_savejson = false + arg_rgi_savetmpfiles = false + arg_rgi_alignmenttool = 'BLAST' + arg_rgi_includeloose = false + arg_rgi_includenudge = false + arg_rgi_lowquality = false + arg_rgi_data = 'NA' + arg_rgi_split_prodigal_jobs = true + + arg_skip_amrfinderplus = false + arg_amrfinderplus_db = null + arg_amrfinderplus_identmin = -1 + arg_amrfinderplus_coveragemin = 0.5 + arg_amrfinderplus_translationtable = 11 + arg_amrfinderplus_plus = false + arg_amrfinderplus_name = false + + arg_skip_deeparg = false + arg_deeparg_data = null + arg_deeparg_data_version = 2 // Make sure to update on module version bump! + arg_deeparg_model = 'LS' + arg_deeparg_minprob = 0.8 + arg_deeparg_alignmentidentity = 50 + arg_deeparg_alignmentevalue = 1e-10 + arg_deeparg_alignmentoverlap = 0.8 + arg_deeparg_numalignmentsperentry = 1000 + + arg_skip_abricate = false + arg_abricate_db = 'ncbi' + arg_abricate_minid = 80 + arg_abricate_mincov = 80 + + arg_hamronization_summarizeformat = 'tsv' // BGC options - run_bgc_screening = false - - bgc_skip_antismash = false - bgc_antismash_databases = null - bgc_antismash_installationdirectory = null - bgc_antismash_cbgeneral = false - bgc_antismash_cbknownclusters = false - bgc_antismash_cbsubclusters = false - bgc_antismash_smcogtrees = false - bgc_antismash_ccmibig = false - bgc_antismash_contigminlength = 1000 - bgc_antismash_hmmdetectionstrictness = 'relaxed' - bgc_antismash_taxon = 'bacteria' - bgc_antismash_sampleminlength = 1000 - - bgc_skip_deepbgc = false - bgc_deepbgc_database = null - bgc_deepbgc_score = 0.5 - bgc_deepbgc_prodigalsinglemode = false - bgc_deepbgc_mergemaxproteingap = 0 - bgc_deepbgc_mergemaxnuclgap = 0 - bgc_deepbgc_minnucl = 1 - bgc_deepbgc_minproteins = 1 - bgc_deepbgc_mindomains = 1 - bgc_deepbgc_minbiodomains = 0 - bgc_deepbgc_classifierscore = 0.5 - - bgc_skip_gecco = false - bgc_gecco_cds = 3 - bgc_gecco_threshold = 0.8 - bgc_gecco_pfilter = 0.000000001 - bgc_gecco_edgedistance = 0 - bgc_gecco_mask = false - - bgc_skip_hmmsearch = false - bgc_hmmsearch_models = null - bgc_hmmsearch_savealignments = false - bgc_hmmsearch_savetargets = false - bgc_hmmsearch_savedomains = false + run_bgc_screening = false + + bgc_skip_antismash = false + bgc_antismash_databases = null + bgc_antismash_installationdirectory = null + bgc_antismash_cbgeneral = false + bgc_antismash_cbknownclusters = false + bgc_antismash_cbsubclusters = false + bgc_antismash_smcogtrees = false + bgc_antismash_ccmibig = false + bgc_antismash_contigminlength = 1000 + bgc_antismash_hmmdetectionstrictness = 'relaxed' + bgc_antismash_taxon = 'bacteria' + bgc_antismash_sampleminlength = 1000 + + bgc_skip_deepbgc = false + bgc_deepbgc_database = null + bgc_deepbgc_score = 0.5 + bgc_deepbgc_prodigalsinglemode = false + bgc_deepbgc_mergemaxproteingap = 0 + 
bgc_deepbgc_mergemaxnuclgap = 0 + bgc_deepbgc_minnucl = 1 + bgc_deepbgc_minproteins = 1 + bgc_deepbgc_mindomains = 1 + bgc_deepbgc_minbiodomains = 0 + bgc_deepbgc_classifierscore = 0.5 + + bgc_skip_gecco = false + bgc_gecco_cds = 3 + bgc_gecco_threshold = 0.8 + bgc_gecco_pfilter = 0.000000001 + bgc_gecco_edgedistance = 0 + bgc_gecco_mask = false + + bgc_skip_hmmsearch = false + bgc_hmmsearch_models = null + bgc_hmmsearch_savealignments = false + bgc_hmmsearch_savetargets = false + bgc_hmmsearch_savedomains = false // MultiQC options - multiqc_config = null - multiqc_title = null - multiqc_logo = null - max_multiqc_email_size = '25.MB' - multiqc_methods_description = null + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null // Boilerplate options - outdir = null - publish_dir_mode = 'copy' - email = null - email_on_fail = null - plaintext_email = false - monochrome_logs = false - hook_url = null - help = false - version = false + outdir = null + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false // Config options - config_profile_name = null - config_profile_description = null - custom_config_version = 'master' - custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_contact = null - config_profile_url = null + config_profile_name = null + config_profile_description = null + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + config_profile_contact = null + config_profile_url = null // Max resource options // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 - max_time = '240.h' + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' // Schema validation default options - validationFailUnrecognisedParams = false - validationLenientMode = false - validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta' - validationShowHiddenParams = false - validate_params = true + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta' + validationShowHiddenParams = false + validate_params = true } @@ -323,11 +340,12 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_bgc { includeConfig 'conf/test_bgc.config' } - test_full { includeConfig 'conf/test_full.config' } - test_deeparg { includeConfig 'conf/test_deeparg.config' } - test_nothing { includeConfig 'conf/test_nothing.config' } + test { includeConfig 'conf/test.config' } + test_bgc { includeConfig 'conf/test_bgc.config' } + test_taxonomy { includeConfig 'conf/test_taxonomy.config' } + test_full { includeConfig 'conf/test_full.config' } + test_deeparg { includeConfig 'conf/test_deeparg.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index 55149c6c..31678cd0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -68,6 +68,119 @@ }, "fa_icon": "fas fa-network-wired" }, + "taxonomic_classification": { + "title": "Taxonomic classification", + "type": "object", + "description": "These options influence whether to activate the taxonomic 
classification of the input nucleotide sequences.",
+            "default": "",
+            "properties": {
+                "run_taxa_classification": {
+                    "type": "boolean",
+                    "description": "Activates the taxonomic classification of input nucleotide sequences.",
+                    "help_text": "This flag turns on the taxonomic classification of input nucleotide sequences. Taxonomic annotation should be turned on if the bacterial sources of the input metagenomes are unknown, as it helps to trace the origin of each AMP, BGC or ARG hit for subsequent laboratory experiments. This flag should be turned off (the default) if the input nucleotide sequences represent a single known genome or if *nf-core/mag* was run beforehand. Turning on this flag slows down the pipeline and requires >8GB RAM. Due to the size of the resulting table, the final complete summary is provided in a zipped format.",
+                    "fa_icon": "fas fa-ad"
+                },
+                "taxa_classification_tool": {
+                    "type": "string",
+                    "default": "mmseqs2",
+                    "help_text": "This flag specifies which tool to use for taxonomic classification. At the moment, only 'MMseqs2' is incorporated in the pipeline.",
+                    "description": "Specifies the tool used for taxonomic classification.",
+                    "fa_icon": "fas fa-check-circle"
+                }
+            },
+            "fa_icon": "fas fa-address-book"
+        },
+        "taxonomic_classification_mmseqs2_databases": {
+            "title": "Taxonomic classification: MMseqs2 databases",
+            "type": "object",
+            "description": "These parameters influence the database used for the taxonomic classification.",
+            "default": "",
+            "properties": {
+                "taxa_classification_mmseqs_databases_localpath": {
+                    "type": "string",
+                    "description": "Specify a path to an MMseqs2-formatted database.",
+                    "help_text": "Specify a path to a database that is prepared in MMseqs2 format as detailed in the [documentation](https://mmseqs.com/latest/userguide.pdf).",
+                    "fa_icon": "fab fa-stackpath"
+                },
+                "taxa_classification_mmseqs_databases_id": {
+                    "type": "string",
+                    "default": "Kalamari",
+                    "help_text": "Specify which MMseqs2-formatted database to use to classify the input contigs. This can be a nucleotide or amino acid database that includes taxonomic classifications. For example, both GTDB (an amino acid database) and SILVA (a nucleotide database) are supported by MMseqs2. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases",
+                    "description": "Specify the label of the database to be used.",
+                    "fa_icon": "fas fa-address-card"
+                },
+                "taxa_classification_mmseqs_databases_savetmp": {
+                    "type": "boolean",
+                    "help_text": "This flag saves the temporary files produced while downloading the database and formatting it in the MMseqs2 format into the output folder. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases: `--remove-tmp-files`",
+                    "description": "Specify whether the temporary files should be saved.",
+                    "fa_icon": "fas fa-file-download"
+                }
+            },
+            "fa_icon": "far fa-address-card"
+        },
+        "taxonomic_classification_mmseqs2_taxonomy": {
+            "title": "Taxonomic classification: MMseqs2 taxonomy",
+            "type": "object",
+            "description": "These parameters influence the taxonomic classification step.",
+            "default": "",
+            "properties": {
+                "taxa_classification_mmseqs_taxonomy_savetmp": {
+                    "type": "boolean",
+                    "help_text": "This flag saves the temporary files from creating the taxonomy database and the final `tsv` file into the output folder. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--remove-tmp-files`",
+                    "description": "Specify whether to save the temporary files.",
+                    "fa_icon": "fab fa-adversal"
+                },
+                "taxa_classification_mmseqs_taxonomy_searchtype": {
+                    "type": "integer",
+                    "default": 2,
+                    "help_text": "Specify the type of alignment to be carried out between the query database and the reference MMseqs2 database. This can be set to '0' for automatic detection, '1' for amino acid alignment, '2' for translating the inputs and running the alignment on the translated sequences, '3' for nucleotide-based alignment and '4' for alignment of the translated nucleotide sequences. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--search-type`",
+                    "description": "Specify the alignment type between database and query.",
+                    "fa_icon": "fas fa-align-center"
+                },
+                "taxa_classification_mmseqs_taxonomy_lcaranks": {
+                    "type": "string",
+                    "default": "kingdom,phylum,class,order,family,genus,species",
+                    "help_text": "Specify the taxonomic ranks to include in the taxonomic lineage column of the final `.tsv` file, e.g. 'kingdom,phylum,class,order,family,genus,species'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-ranks`",
+                    "description": "Specify the taxonomic levels to display in the result table.",
+                    "fa_icon": "fas fa-stream"
+                },
+                "taxa_classification_mmseqs_taxonomy_taxlineage": {
+                    "type": "integer",
+                    "default": 1,
+                    "help_text": "This flag specifies whether the taxonomic lineage should be included in the output `.tsv` file. The taxonomic lineage is obtained from the internal module of `mmseqs/taxonomy`, which infers the last common ancestor to classify the taxonomy. A value of '0' writes no taxonomic lineage, a value of '1' adds a column with the full lineage names prefixed with the abbreviation of the lineage level, e.g. k_Prokaryotes;p_Bacteroidetes;c_....;o_....;f_....;g_....;s_...., while a value of '2' adds a column with the full NCBI taxid lineage, e.g. 1324;2345;4546;5345. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--tax-lineage`",
+                    "description": "Specify whether to include or remove the taxonomic lineage.",
+                    "fa_icon": "fab fa-audible"
+                },
+                "taxa_classification_mmseqs_taxonomy_sensitivity": {
+                    "type": "string",
+                    "default": "5.0",
+                    "help_text": "This flag specifies the speed and sensitivity of the taxonomic search, i.e. how many k-mers are produced during the preliminary seeding stage. A very fast search requires a low value, e.g. '1.0', and a very sensitive search a high value, e.g. '7.0'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `-s`",
+                    "description": "Specify the speed and sensitivity for taxonomy assignment.",
+                    "fa_icon": "fas fa-history"
+                },
+                "taxa_classification_mmseqs_taxonomy_orffilters": {
+                    "type": "string",
+                    "default": "2.0",
+                    "help_text": "This flag specifies the sensitivity used for prefiltering the query ORFs. Before the taxonomy-assigning step, MMseqs2 searches the predicted ORFs against the provided database; this value influences the speed of that search. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--orf-filter-s`",
+                    "description": "Specify the ORF search sensitivity in the prefilter step.",
+                    "fa_icon": "fas fa-clock"
+                },
+                "taxa_classification_mmseqs_taxonomy_lcamode": {
+                    "type": "integer",
+                    "default": 3,
+                    "help_text": "This flag specifies the strategy used for assigning the last common ancestor (LCA). With the default value of '3', MMseqs2 assigns taxonomy based on an accelerated approximation of the 2bLCA protocol, in which the assignment is based not only on the usual alignment parameters but also on the taxonomic classification of the LCA. With a value of '4', the LCA is assigned based on all equally scoring top hits. With a value of '1', the LCA assignment is disregarded and the taxonomy is assigned based on the usual alignment parameters alone, such as E-value and coverage. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-mode`",
+                    "description": "Specify the mode to assign the taxonomy.",
+                    "fa_icon": "fas fa-broom"
+                },
+                "taxa_classification_mmseqs_taxonomy_votemode": {
+                    "type": "integer",
+                    "default": 1,
+                    "help_text": "This flag specifies how the weights for the taxonomic vote are computed. A value of '0' uses uniform weights for the taxonomy assignments, a value of '1' uses the minus log E-value and a value of '2' uses the actual alignment score. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--vote-mode`",
+                    "description": "Specify the weights of the taxonomic assignment.",
+                    "fa_icon": "fas fa-poll"
+                }
+            },
+            "fa_icon": "fas fa-tag"
+        },
         "annotation": {
             "title": "Annotation",
             "type": "object",
@@ -87,7 +200,8 @@
                     "fa_icon": "fas fa-save"
                 }
             },
-            "fa_icon": "fas fa-file-signature"
+            "fa_icon": "fas fa-file-signature",
+            "help_text": ""
         },
         "annotation_bakta": {
             "title": "Annotation: BAKTA",
@@ -1314,15 +1428,21 @@
         }
     },
     "allOf": [
-        {
-            "$ref": "#/definitions/annotation_pyrodigal"
-        },
         {
             "$ref": "#/definitions/input_output_options"
        },
         {
             "$ref": "#/definitions/screening_type_activation"
         },
+        {
+            "$ref": "#/definitions/taxonomic_classification"
+        },
+        {
+            "$ref": "#/definitions/taxonomic_classification_mmseqs2_databases"
+        },
+        {
+            "$ref": "#/definitions/taxonomic_classification_mmseqs2_taxonomy"
+        },
         {
             "$ref": "#/definitions/annotation"
         },
@@ -1335,6 +1455,9 @@
         {
             "$ref": "#/definitions/annotation_prodigal"
         },
+        {
+            "$ref": "#/definitions/annotation_pyrodigal"
+        },
         {
             "$ref": "#/definitions/database_downloading_options"
         },
diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf
index b5fb5845..30f4a171 100644
--- a/subworkflows/local/amp.nf
+++ b/subworkflows/local/amp.nf
@@ -9,12 +9,14 @@ include { AMPIR } from '..
include { DRAMP_DOWNLOAD } from '../../modules/local/dramp_download' include { AMPCOMBI } from '../../modules/nf-core/ampcombi/main' include { GUNZIP as GUNZIP_MACREL_PRED ; GUNZIP as GUNZIP_MACREL_ORFS } from '../../modules/nf-core/gunzip/main' -include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { TABIX_BGZIP as AMP_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { MERGE_TAXONOMY_AMPCOMBI } from '../../modules/local/merge_taxonomy_ampcombi' workflow AMP { take: contigs // tuple val(meta), path(contigs) faa // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) + tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -102,20 +104,27 @@ workflow AMP { AMPCOMBI( ch_input_for_ampcombi.input, ch_input_for_ampcombi.faa, ch_ampcombi_input_db ) ch_versions = ch_versions.mix( AMPCOMBI.out.versions ) - ch_ampcombi_summaries = ch_ampcombi_summaries.mix( AMPCOMBI.out.csv ) //AMPCOMBI concatenation - ch_ampcombi_summaries_out = ch_ampcombi_summaries - .multiMap{ - input: [ it[0] ] - summary: it[1] - } + if ( !params.run_taxa_classification ) { + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.tsv', storeDir: "${params.outdir}/reports/ampcombi",keepHeader:true ) + } else { + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.tsv', keepHeader:true ) + } - ch_tabix_input = Channel.of( [ 'id':'ampcombi_complete_summary' ] ) - .combine( ch_ampcombi_summaries_out.summary.collectFile( name: 'ampcombi_complete_summary.csv', keepHeader:true ) ) + // MERGE_TAXONOMY + if ( params.run_taxa_classification ) { - TABIX_BGZIP( ch_tabix_input ) - ch_versions = ch_versions.mix( TABIX_BGZIP.out.versions ) + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + MERGE_TAXONOMY_AMPCOMBI(ch_ampcombi_summaries, ch_mmseqs_taxonomy_list) + ch_versions = ch_versions.mix(MERGE_TAXONOMY_AMPCOMBI.out.versions) + + ch_tabix_input = Channel.of( [ 'id':'ampcombi_complete_summary_taxonomy' ] ) + .combine(MERGE_TAXONOMY_AMPCOMBI.out.tsv) + + AMP_TABIX_BGZIP( ch_tabix_input ) + ch_versions = ch_versions.mix( AMP_TABIX_BGZIP.out.versions ) + } emit: versions = ch_versions diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index 97884b2f..df781661 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -2,26 +2,29 @@ Run ARG screening tools */ -include { ABRICATE_RUN } from '../../modules/nf-core/abricate/run/main' -include { AMRFINDERPLUS_UPDATE } from '../../modules/nf-core/amrfinderplus/update/main' -include { AMRFINDERPLUS_RUN } from '../../modules/nf-core/amrfinderplus/run/main' -include { DEEPARG_DOWNLOADDATA } from '../../modules/nf-core/deeparg/downloaddata/main' -include { DEEPARG_PREDICT } from '../../modules/nf-core/deeparg/predict/main' -include { FARGENE } from '../../modules/nf-core/fargene/main' -include { HAMRONIZATION_ABRICATE } from '../../modules/nf-core/hamronization/abricate/main' -include { HAMRONIZATION_RGI } from '../../modules/nf-core/hamronization/rgi/main' -include { HAMRONIZATION_DEEPARG } from '../../modules/nf-core/hamronization/deeparg/main' -include { HAMRONIZATION_AMRFINDERPLUS } from '../../modules/nf-core/hamronization/amrfinderplus/main' -include { HAMRONIZATION_FARGENE } from '../../modules/nf-core/hamronization/fargene/main' -include { HAMRONIZATION_SUMMARIZE } from '../../modules/nf-core/hamronization/summarize/main' -include { RGI_CARDANNOTATION } from 
'../../modules/nf-core/rgi/cardannotation/main' -include { RGI_MAIN } from '../../modules/nf-core/rgi/main/main' -include { UNTAR } from '../../modules/nf-core/untar/main' +include { ABRICATE_RUN } from '../../modules/nf-core/abricate/run/main' +include { AMRFINDERPLUS_UPDATE } from '../../modules/nf-core/amrfinderplus/update/main' +include { AMRFINDERPLUS_RUN } from '../../modules/nf-core/amrfinderplus/run/main' +include { DEEPARG_DOWNLOADDATA } from '../../modules/nf-core/deeparg/downloaddata/main' +include { DEEPARG_PREDICT } from '../../modules/nf-core/deeparg/predict/main' +include { FARGENE } from '../../modules/nf-core/fargene/main' +include { HAMRONIZATION_ABRICATE } from '../../modules/nf-core/hamronization/abricate/main' +include { HAMRONIZATION_RGI } from '../../modules/nf-core/hamronization/rgi/main' +include { HAMRONIZATION_DEEPARG } from '../../modules/nf-core/hamronization/deeparg/main' +include { HAMRONIZATION_AMRFINDERPLUS } from '../../modules/nf-core/hamronization/amrfinderplus/main' +include { HAMRONIZATION_FARGENE } from '../../modules/nf-core/hamronization/fargene/main' +include { HAMRONIZATION_SUMMARIZE } from '../../modules/nf-core/hamronization/summarize/main' +include { RGI_CARDANNOTATION } from '../../modules/nf-core/rgi/cardannotation/main' +include { RGI_MAIN } from '../../modules/nf-core/rgi/main/main' +include { UNTAR } from '../../modules/nf-core/untar/main' +include { TABIX_BGZIP as ARG_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { MERGE_TAXONOMY_HAMRONIZATION } from '../../modules/local/merge_taxonomy_hamronization' workflow ARG { take: contigs // tuple val(meta), path(contigs) annotations + tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -153,6 +156,20 @@ workflow ARG { HAMRONIZATION_SUMMARIZE( ch_input_for_hamronization_summarize, params.arg_hamronization_summarizeformat ) ch_versions = ch_versions.mix( HAMRONIZATION_SUMMARIZE.out.versions ) + // MERGE_TAXONOMY + if ( params.run_taxa_classification ) { + + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + MERGE_TAXONOMY_HAMRONIZATION( HAMRONIZATION_SUMMARIZE.out.tsv, ch_mmseqs_taxonomy_list ) + ch_versions = ch_versions.mix( MERGE_TAXONOMY_HAMRONIZATION.out.versions ) + + ch_tabix_input = Channel.of( [ 'id':'hamronization_combined_report' ] ) + .combine(MERGE_TAXONOMY_HAMRONIZATION.out.tsv) + + ARG_TABIX_BGZIP( ch_tabix_input ) + ch_versions = ch_versions.mix( ARG_TABIX_BGZIP.out.versions ) + } + emit: versions = ch_versions } diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index ed5c103b..3626c283 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -12,6 +12,8 @@ include { HMMER_HMMSEARCH as BGC_HMMER_HMMSEARCH } from '../../modules/nf-core include { DEEPBGC_DOWNLOAD } from '../../modules/nf-core/deepbgc/download/main' include { DEEPBGC_PIPELINE } from '../../modules/nf-core/deepbgc/pipeline/main' include { COMBGC } from '../../modules/local/combgc' +include { TABIX_BGZIP as BGC_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { MERGE_TAXONOMY_COMBGC } from '../../modules/local/merge_taxonomy_combgc' workflow BGC { @@ -20,6 +22,7 @@ workflow BGC { gff // tuple val(meta), path(.out.gff) faa // tuple val(meta), path(.out.faa) gbk // tuple val(meta), path(.out.gbk) + tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -184,7 +187,26 @@ workflow BGC { COMBGC ( ch_bgcresults_for_combgc ) ch_versions = ch_versions.mix( 
COMBGC.out.versions ) - ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile( name: 'combgc_complete_summary.tsv', storeDir: "${params.outdir}/reports/combgc", keepHeader:true ) + // COMBGC concatenation + if ( !params.run_taxa_classification ) { + ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile( name: 'combgc_complete_summary.tsv', storeDir: "${params.outdir}/reports/combgc", keepHeader:true ) + } else { + ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile( name: 'combgc_complete_summary.tsv', keepHeader:true ) + } + + // MERGE_TAXONOMY + if ( params.run_taxa_classification ) { + + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + MERGE_TAXONOMY_COMBGC( ch_combgc_summaries, ch_mmseqs_taxonomy_list ) + ch_versions = ch_versions.mix( MERGE_TAXONOMY_COMBGC.out.versions ) + + ch_tabix_input = Channel.of( [ 'id':'combgc_complete_summary_taxonomy' ] ) + .combine(MERGE_TAXONOMY_COMBGC.out.tsv) + + BGC_TABIX_BGZIP( ch_tabix_input ) + ch_versions = ch_versions.mix( BGC_TABIX_BGZIP.out.versions ) + } emit: versions = ch_versions diff --git a/subworkflows/local/taxa_class.nf b/subworkflows/local/taxa_class.nf new file mode 100644 index 00000000..ec9f273a --- /dev/null +++ b/subworkflows/local/taxa_class.nf @@ -0,0 +1,55 @@ +/* + TAXONOMIC CLASSIFICATION +*/ + +include { MMSEQS_CREATEDB } from '../../modules/nf-core/mmseqs/createdb/main' +include { MMSEQS_DATABASES } from '../../modules/nf-core/mmseqs/databases/main' +include { MMSEQS_TAXONOMY } from '../../modules/nf-core/mmseqs/taxonomy/main' +include { MMSEQS_CREATETSV } from '../../modules/nf-core/mmseqs/createtsv/main' + +workflow TAXA_CLASS { + take: + contigs // tuple val(meta), path(contigs) + + main: + ch_versions = Channel.empty() + ch_mmseqs_db = Channel.empty() + ch_taxonomy_querydb = Channel.empty() + ch_taxonomy_querydb_taxdb = Channel.empty() + ch_taxonomy_tsv = Channel.empty() + + if ( params.taxa_classification_tool == 'mmseqs2') { + + // Download the ref db if not supplied by user + // MMSEQS_DATABASE + if ( params.taxa_classification_mmseqs_databases_localpath != null ) { + ch_mmseqs_db = Channel + .fromPath( params.taxa_classification_mmseqs_databases_localpath ) + .first() + } else { + MMSEQS_DATABASES ( params.taxa_classification_mmseqs_databases_id ) + ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions ) + ch_mmseqs_db = ( MMSEQS_DATABASES.out.database ) + } + + // Create db for query contigs, assign taxonomy and convert to table format + // MMSEQS_CREATEDB + MMSEQS_CREATEDB ( contigs ) + ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions ) + ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db + + // MMSEQS_TAXONOMY + MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) + ch_versions = ch_versions.mix( MMSEQS_TAXONOMY.out.versions ) + ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy + + // MMSEQS_CREATETSV + MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) + ch_versions = ch_versions.mix( MMSEQS_CREATETSV.out.versions ) + ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv + } + + emit: + versions = ch_versions + sample_taxonomy = ch_taxonomy_tsv // channel: [ val(meta), tsv ] +} diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 4ae48307..45514819 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -29,9 +29,10 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
fil
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { AMP } from '../subworkflows/local/amp'
-include { ARG } from '../subworkflows/local/arg'
-include { BGC } from '../subworkflows/local/bgc'
+include { AMP        } from '../subworkflows/local/amp'
+include { ARG        } from '../subworkflows/local/arg'
+include { BGC        } from '../subworkflows/local/bgc'
+include { TAXA_CLASS } from '../subworkflows/local/taxa_class'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -105,6 +106,26 @@ workflow FUNCSCAN {
             [ meta, fasta ] }

+    /*
+        TAXONOMIC CLASSIFICATION
+    */
+
+    // The final subworkflow reports need taxonomic classification.
+    // This can be either on NT or AA level depending on annotation.
+    // TODO: Only NT at the moment. AA tax. classification will be added only when its PR is merged.
+    if ( params.run_taxa_classification ) {
+        TAXA_CLASS ( ch_prepped_input )
+        ch_versions     = ch_versions.mix( TAXA_CLASS.out.versions )
+        ch_taxonomy_tsv = TAXA_CLASS.out.sample_taxonomy
+
+    } else {
+
+        ch_mmseqs_db              = Channel.empty()
+        ch_taxonomy_querydb       = Channel.empty()
+        ch_taxonomy_querydb_taxdb = Channel.empty()
+        ch_taxonomy_tsv           = Channel.empty()
+    }
+
     /*
         ANNOTATION
     */
@@ -189,7 +210,7 @@ workflow FUNCSCAN {
     /*
         AMPs
     */
-    if ( params.run_amp_screening ) {
+    if ( params.run_amp_screening && !params.run_taxa_classification ) {
         AMP (
             ch_prepped_input,
             ch_annotation_faa
@@ -197,17 +218,40 @@ workflow FUNCSCAN {
                 meta, file ->
                     if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
                     !file.isEmpty()
-            }
+
+            },
+            ch_taxonomy_tsv
         )
         ch_versions = ch_versions.mix(AMP.out.versions)
+    } else if ( params.run_amp_screening && params.run_taxa_classification ) {
+        AMP (
+            ch_prepped_input,
+            ch_annotation_faa
+                .filter {
+                    meta, file ->
+                        if ( file.isEmpty() ) log.warn("Annotation of the following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                        !file.isEmpty()
+                },
+            ch_taxonomy_tsv
+                .filter {
+                    meta, file ->
+                        if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+                        !file.isEmpty()
+                }
+        )
+        ch_versions = ch_versions.mix( AMP.out.versions )
     }

     /*
         ARGs
     */
-    if ( params.run_arg_screening ) {
+    if ( params.run_arg_screening && !params.run_taxa_classification ) {
         if ( params.arg_skip_deeparg ) {
-            ARG ( ch_prepped_input, [] )
+            ARG (
+                ch_prepped_input,
+                [],
+                ch_taxonomy_tsv
+            )
         } else {
             ARG (
                 ch_prepped_input,
@@ -216,7 +260,38 @@ workflow FUNCSCAN {
                     meta, file ->
                         if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
                         !file.isEmpty()
+                },
+                ch_taxonomy_tsv
+            )
+        }
+        ch_versions = ch_versions.mix( ARG.out.versions )
+    } else if ( params.run_arg_screening && params.run_taxa_classification ) {
+        if ( params.arg_skip_deeparg ) {
+            ARG (
+                ch_prepped_input,
+                [],
+                ch_taxonomy_tsv
+                    .filter {
+                        meta, file ->
+                            if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+                            !file.isEmpty()
+                    }
+            )
+        } else {
+            ARG (
+                ch_prepped_input,
+                ch_annotation_faa
+                    .filter {
+                        meta, file ->
+                            if ( file.isEmpty() ) log.warn("Annotation of the following sample produced an empty FAA file. ARG screening tools requiring this file will not be executed: ${meta.id}")
+                            !file.isEmpty()
+                    },
+                ch_taxonomy_tsv
+                    .filter {
+                        meta, file ->
+                            if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+                            !file.isEmpty()
+                    }
             )
         }
         ch_versions = ch_versions.mix( ARG.out.versions )
@@ -225,7 +300,31 @@ workflow FUNCSCAN {
     /*
         BGCs
     */
-    if ( params.run_bgc_screening ) {
+    if ( params.run_bgc_screening && !params.run_taxa_classification ) {
+        BGC (
+            ch_prepped_input,
+            ch_annotation_gff
+                .filter {
+                    meta, file ->
+                        if ( file.isEmpty() ) log.warn("Annotation of the following sample produced an empty GFF file. BGC screening tools requiring this file will not be executed: ${meta.id}")
+                        !file.isEmpty()
+                },
+            ch_annotation_faa
+                .filter {
+                    meta, file ->
+                        if ( file.isEmpty() ) log.warn("Annotation of the following sample produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}")
+                        !file.isEmpty()
+                },
+            ch_annotation_gbk
+                .filter {
+                    meta, file ->
+                        if ( file.isEmpty() ) log.warn("Annotation of the following sample produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}")
+                        !file.isEmpty()
+                },
+            ch_taxonomy_tsv
+        )
+        ch_versions = ch_versions.mix( BGC.out.versions )
+    } else if ( params.run_bgc_screening && params.run_taxa_classification ) {
         BGC (
             ch_prepped_input,
             ch_annotation_gff
@@ -245,10 +344,17 @@ workflow FUNCSCAN {
                 meta, file ->
                     if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}")
                     !file.isEmpty()
+                },
+            ch_taxonomy_tsv
+                .filter {
+                    meta, file ->
+                        if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+                        !file.isEmpty()
                 }
         )
         ch_versions = ch_versions.mix( BGC.out.versions )
     }
+
     //
     // Collate and save software versions
     //
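
Reviewer note (not part of the patch): a minimal invocation sketch for exercising the new taxonomy merging end to end, using only flags introduced in this diff plus standard pipeline options; the samplesheet and output paths are placeholders:

    nextflow run nf-core/funcscan \
        -profile docker \
        --input samplesheet.csv \
        --outdir results \
        --run_amp_screening \
        --run_taxa_classification \
        --taxa_classification_mmseqs_databases_id 'Kalamari'

With `--run_taxa_classification` set, TAXA_CLASS builds an MMseqs2 query database from the input contigs, assigns taxonomy against the downloaded 'Kalamari' database, and flattens the result to a per-sample TSV; each active screening subworkflow then merges that TSV into its summary table and compresses the merged report via TABIX_BGZIP. The same path is exercised in CI by the new `test_taxonomy` profile.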
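
For orientation, the chain of MMseqs2 calls that TAXA_CLASS wires together corresponds roughly to the following shell session. This is a sketch under assumptions: the directory and file names are placeholders, and only the flags visible in the modules and test configs above are shown.

    # download a taxonomy-aware reference database (mmseqs/databases module)
    mmseqs databases Kalamari refdb/database tmp/ --compressed 1

    # build a query database from the input contigs (mmseqs/createdb module)
    mmseqs createdb contigs.fasta querydb/query

    # assign taxonomy to the query contigs (mmseqs/taxonomy module;
    # --search-type 2 mirrors the test configs / searchtype default)
    mmseqs taxonomy querydb/query refdb/database taxdb/result tmp1 --search-type 2 --compressed 1

    # flatten the taxonomy result database to a TSV (mmseqs/createtsv module)
    mmseqs createtsv querydb/query taxdb/result result.tsv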