From d3e27929bc3a599bb695af2681b620e4dbe52eda Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 7 Aug 2024 14:59:35 -0400 Subject: [PATCH 01/15] Enhanced pipeline logic to support user-defined --- assets/samplesheet.csv | 8 ++++---- assets/schema_input.json | 9 +++++++-- conf/iridanext.config | 1 + conf/test.config | 2 +- workflows/speciesabundance.nf | 33 +++++++++++++++++++++++++-------- 5 files changed, 38 insertions(+), 15 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 9986717..b8c2da1 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,4 @@ -sample,fastq_1,fastq_2 -SAMPLE1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz -SAMPLE2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz -SAMPLE3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, +sample,sample_name,fastq_1,fastq_2 +SAMPLE1,A1 sample,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz +SAMPLE2,B2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz +SAMPLE3,B2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, diff --git a/assets/schema_input.json b/assets/schema_input.json index 5934ed1..6792cf4 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -10,10 +10,15 @@ "sample": { "type": "string", "pattern": "^\\S+$", - "meta": ["id"], + "meta": ["irida_id"], "unique": true, "errorMessage": "Sample name must be provided and cannot contain spaces" }, + "sample_name": { 
+ "type": "string", + "meta": ["id"], + "errorMessage": "Sample name must be provided" + }, "fastq_1": { "type": "string", "pattern": "^\\S+\\.f(ast)?q(\\.gz)?$", @@ -33,6 +38,6 @@ ] } }, - "required": ["sample", "fastq_1"] + "required": ["sample", "sample_name", "fastq_1"] } } diff --git a/conf/iridanext.config b/conf/iridanext.config index 481bf20..f2352ca 100644 --- a/conf/iridanext.config +++ b/conf/iridanext.config @@ -4,6 +4,7 @@ iridanext { path = "${params.outdir}/iridanext.output.json.gz" overwrite = true files { + idkey = "irida_id" global = [ "**/failure/failures_report.csv" ] diff --git a/conf/test.config b/conf/test.config index 92c0bd3..e64c7a9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,6 +20,6 @@ params { max_time = '1.h' // Input data - input = 'https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/assets/samplesheet.csv' + input = "${projectDir}/assets/samplesheet.csv" database = "${projectDir}/tests/data/minidb" } diff --git a/workflows/speciesabundance.nf b/workflows/speciesabundance.nf index eb7e250..42974f7 100644 --- a/workflows/speciesabundance.nf +++ b/workflows/speciesabundance.nf @@ -68,19 +68,36 @@ workflow SpAnce { ch_versions = Channel.empty() + // Track processed IDs + def processedIDs = [] as Set + // Create a new channel of metadata from a sample sheet // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") + .map { meta, fastq_1, fastq_2 -> + // Replace spaces in 'id' with underscores + if (meta.id) { + meta.id = meta.id.replaceAll(/\s+/, '_') + } + // Ensure ID is unique by appending meta.irida_id if needed + while (processedIDs.contains(meta.id)) { + meta.id = "${meta.id}_${meta.irida_id}" + } + // Add the ID to the set of processed IDs + processedIDs << meta.id + // Return the adjusted tuple + return [meta, fastq_1, fastq_2] + } // Map the inputs so that they conform to the nf-core-expected "reads" format. 
// Either [meta, [fastq_1]] or [meta, [fastq_1, fastq_2]] if fastq_2 exists .map { meta, fastq_1, fastq_2 -> - if (fastq_2) { - meta.single_end = false - tuple(meta, [ file(fastq_1), file(fastq_2) ]) - } else { - meta.single_end = true - tuple(meta, [ file(fastq_1) ]) - } + if (fastq_2) { + meta.single_end = false + tuple(meta, [ file(fastq_1), file(fastq_2) ]) + } else { + meta.single_end = true + tuple(meta, [ file(fastq_1) ]) + } } kraken_database = select_kraken_database(params.database, params.kraken2_db) @@ -173,7 +190,7 @@ workflow SpAnce { ch_csvs = csv_files.map{ meta, topN -> topN }.collect().map{ - topN -> [ [id:"merged_topN"], topN] + topN -> [ [id:"merged_topN", irida_id: "sample"], topN] } CSVTK_CONCAT ( From 25984f7047180aeb95f0d33f3d2457a25ce7fb63 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 7 Aug 2024 15:37:15 -0400 Subject: [PATCH 02/15] Updated processes to ensure CSVTK generates files with a 'sample_id' column based on 'sample' (meta.irida_id) for proper nf-iridanext plugin metadata functionality. 
--- modules/local/topN/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/topN/main.nf b/modules/local/topN/main.nf index 973b59f..ddab667 100644 --- a/modules/local/topN/main.nf +++ b/modules/local/topN/main.nf @@ -38,7 +38,7 @@ process TOP_N { ${abundances} \\ ${args} \\ -n ${top_n} \\ - -s ${meta.id} \\ + -s ${meta.irida_id} \\ > ${meta.id}_${taxonomic_level}_top_${top_n}.csv cat <<-END_VERSIONS > versions.yml From 822e93d6758f2e9e1b0ce037cf7810fd7415bb7f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 7 Aug 2024 15:42:41 -0400 Subject: [PATCH 03/15] Fixed linting and EC issues --- .github/workflows/linting.yml | 19 +++++++++---------- .github/workflows/linting_comment.yml | 2 +- workflows/speciesabundance.nf | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 073e187..1fcafe8 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,13 +14,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 - cache: "pip" + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit @@ -32,14 +31,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: 
actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - name: Install dependencies @@ -60,7 +59,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875..40acc23 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed diff --git a/workflows/speciesabundance.nf b/workflows/speciesabundance.nf index 42974f7..859e884 100644 --- a/workflows/speciesabundance.nf +++ b/workflows/speciesabundance.nf @@ -69,7 +69,7 @@ workflow SpAnce { ch_versions = Channel.empty() // Track processed IDs - def processedIDs = [] as Set + def processedIDs = [] as Set // Create a new channel of metadata from a sample sheet // NB: `input` corresponds to `params.input` and associated sample sheet schema From 03933e89579f39978b64d56338aeb2dfccef8bff Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 7 Aug 2024 16:52:25 -0400 Subject: [PATCH 04/15] Updated CI testing to incorporate sample_name. 
--- tests/data/error_samplesheet.csv | 10 ++-- tests/data/fail_samplesheet.csv | 4 +- tests/data/test_iridanext.output.json | 66 +++++++++++----------- tests/main.nf.test | 80 +++++++++++++-------------- 4 files changed, 80 insertions(+), 80 deletions(-) diff --git a/tests/data/error_samplesheet.csv b/tests/data/error_samplesheet.csv index d805928..247c134 100644 --- a/tests/data/error_samplesheet.csv +++ b/tests/data/error_samplesheet.csv @@ -1,5 +1,5 @@ -sample,fastq_1,fastq_2 -SAMPLE1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz -SAMPLE2,https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/tests/data/fastq/test-kraken_R1_001.fastq.gz,https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/tests/data/fastq/test-kraken_R2_001.fastq.gz -SAMPLE3,https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/tests/data/fastq/test-bracken_R1_001.fastq.gz,https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/tests/data/fastq/test-bracken_R2_001.fastq.gz -SAMPLE4,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz +sample,sample_name,fastq_1,fastq_2 +SAMPLE1,A1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz +SAMPLE2,B2,https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/tests/data/fastq/test-kraken_R1_001.fastq.gz,https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/tests/data/fastq/test-kraken_R2_001.fastq.gz +SAMPLE3,C3,https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/tests/data/fastq/test-bracken_R1_001.fastq.gz,https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/tests/data/fastq/test-bracken_R2_001.fastq.gz 
+SAMPLE4,D4,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz diff --git a/tests/data/fail_samplesheet.csv b/tests/data/fail_samplesheet.csv index e0fe2ba..f34b732 100644 --- a/tests/data/fail_samplesheet.csv +++ b/tests/data/fail_samplesheet.csv @@ -1,2 +1,2 @@ -sample,fastq_1,fastq_2 -SAMPLE1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz +sample,sample_name,fastq_1,fastq_2 +SAMPLE1,A1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz diff --git a/tests/data/test_iridanext.output.json b/tests/data/test_iridanext.output.json index b588dda..b7f8db4 100644 --- a/tests/data/test_iridanext.output.json +++ b/tests/data/test_iridanext.output.json @@ -8,51 +8,51 @@ "samples": { "SAMPLE2": [ { - "path": "krona/SAMPLE2.krona.html" + "path": "krona/B2.krona.html" }, { - "path": "adjust/SAMPLE2_S_bracken_abundances.csv" + "path": "adjust/B2_S_bracken_abundances.csv" }, { - "path": "fastp/SAMPLE2.fastp.html" + "path": "fastp/B2.fastp.html" } ], - "SAMPLE1": [ + "SAMPLE3": [ { - "path": "krona/SAMPLE1.krona.html" + "path": "krona/B2_SAMPLE3.krona.html" }, { - "path": "adjust/SAMPLE1_S_bracken_abundances.csv" + "path": "adjust/B2_SAMPLE3_S_bracken_abundances.csv" }, { - "path": "fastp/SAMPLE1.fastp.html" + "path": "fastp/B2_SAMPLE3.fastp.html" } ], - "SAMPLE3": [ + "SAMPLE1": [ { - "path": "krona/SAMPLE3.krona.html" + "path": "krona/A1_sample.krona.html" }, { - "path": "adjust/SAMPLE3_S_bracken_abundances.csv" + "path": "adjust/A1_sample_S_bracken_abundances.csv" }, { - "path": "fastp/SAMPLE3.fastp.html" + "path": "fastp/A1_sample.fastp.html" } ] } }, "metadata": { "samples": { - "SAMPLE3": { + "SAMPLE2": { 
"taxonomy_level": "S", - "abundance_1_name": "Bacteroides fragilis", - "abundance_1_ncbi_taxonomy_id": "817", - "abundance_1_num_assigned_reads": "28877", - "abundance_1_fraction_total_reads": "0.577702", - "abundance_2_name": "Escherichia coli", - "abundance_2_ncbi_taxonomy_id": "562", - "abundance_2_num_assigned_reads": "21065", - "abundance_2_fraction_total_reads": "0.421418", + "abundance_1_name": "Escherichia coli", + "abundance_1_ncbi_taxonomy_id": "562", + "abundance_1_num_assigned_reads": "631", + "abundance_1_fraction_total_reads": "0.025253", + "abundance_2_name": "Bacteroides fragilis", + "abundance_2_ncbi_taxonomy_id": "817", + "abundance_2_num_assigned_reads": "22", + "abundance_2_fraction_total_reads": "0.00088", "abundance_3_name": "", "abundance_3_ncbi_taxonomy_id": "", "abundance_3_num_assigned_reads": "", @@ -67,19 +67,19 @@ "abundance_5_fraction_total_reads": "", "unclassified_name": "unclassified", "unclassified_ncbi_taxonomy_id": "0", - "unclassified_num_assigned_reads": "44", - "unclassified_fraction_total_reads": "0.00088" + "unclassified_num_assigned_reads": "24334", + "unclassified_fraction_total_reads": "0.973866" }, - "SAMPLE2": { + "SAMPLE3": { "taxonomy_level": "S", - "abundance_1_name": "Escherichia coli", - "abundance_1_ncbi_taxonomy_id": "562", - "abundance_1_num_assigned_reads": "631", - "abundance_1_fraction_total_reads": "0.025253", - "abundance_2_name": "Bacteroides fragilis", - "abundance_2_ncbi_taxonomy_id": "817", - "abundance_2_num_assigned_reads": "22", - "abundance_2_fraction_total_reads": "0.00088", + "abundance_1_name": "Bacteroides fragilis", + "abundance_1_ncbi_taxonomy_id": "817", + "abundance_1_num_assigned_reads": "28877", + "abundance_1_fraction_total_reads": "0.577702", + "abundance_2_name": "Escherichia coli", + "abundance_2_ncbi_taxonomy_id": "562", + "abundance_2_num_assigned_reads": "21065", + "abundance_2_fraction_total_reads": "0.421418", "abundance_3_name": "", "abundance_3_ncbi_taxonomy_id": "", 
"abundance_3_num_assigned_reads": "", @@ -94,8 +94,8 @@ "abundance_5_fraction_total_reads": "", "unclassified_name": "unclassified", "unclassified_ncbi_taxonomy_id": "0", - "unclassified_num_assigned_reads": "24334", - "unclassified_fraction_total_reads": "0.973866" + "unclassified_num_assigned_reads": "44", + "unclassified_fraction_total_reads": "0.00088" }, "SAMPLE1": { "taxonomy_level": "S", diff --git a/tests/main.nf.test b/tests/main.nf.test index df6adfb..8a277bc 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -8,7 +8,7 @@ nextflow_pipeline { when { params { - input = "https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/assets/samplesheet.csv" + input = "$projectDir/assets/samplesheet.csv" database = "$projectDir/tests/data/minidb" outdir = "results" } @@ -19,12 +19,12 @@ nextflow_pipeline { assert path("$launchDir/results").exists() // check FASTP_TRIM JSON outputs from paired and single reads - def fastp_JSON_paired = path("$launchDir/results/fastp/SAMPLE1.fastp.json").json + def fastp_JSON_paired = path("$launchDir/results/fastp/A1_sample.fastp.json").json assert fastp_JSON_paired.summary.sequencing == "paired end (126 cycles + 126 cycles)" assert fastp_JSON_paired.filtering_result.passed_filter_reads == 99594 - def fastp_JSON_single = path("$launchDir/results/fastp/SAMPLE3.fastp.json").json + def fastp_JSON_single = path("$launchDir/results/fastp/B2_SAMPLE3.fastp.json").json assert fastp_JSON_single.summary.sequencing == "single end (126 cycles)" assert fastp_JSON_single.filtering_result.passed_filter_reads == 49986 @@ -33,7 +33,7 @@ nextflow_pipeline { def lines = [] // KRAKEN2 outputs - lines = path("$launchDir/results/kraken2/SAMPLE2_kraken2_report.txt").text + lines = path("$launchDir/results/kraken2/B2_kraken2_report.txt").text assert lines.contains("97.38 24334 24334 U 0 unclassified") assert lines.contains(" 2.62 655 2 D 2 Bacteria") @@ -43,34 +43,34 @@ nextflow_pipeline { assert lines.contains(" 0.09 22 22 S1 295405 
Bacteroides fragilis YCH46") // BRACKEN outputs - lines = path("$launchDir/results/bracken/SAMPLE2_S_bracken.txt").text + lines = path("$launchDir/results/bracken/B2_S_bracken.txt").text assert lines.contains("100.00 653 0 D 2 Bacteria") assert lines.contains("96.63 631 631 S 562 Escherichia coli") assert lines.contains("3.37 22 22 S 817 Bacteroides fragilis") - lines = path("$launchDir/results/bracken/SAMPLE2_S_bracken_abundances_unsorted.tsv").text + lines = path("$launchDir/results/bracken/B2_S_bracken_abundances_unsorted.tsv").text assert lines.contains("Escherichia coli 562 S 631 0 631 0.96631") assert lines.contains("Bacteroides fragilis 817 S 22 0 22 0.03369") // ADJUST_BRACKEN outputs - lines = path("$launchDir/results/adjust/SAMPLE2_S_bracken_abundances.csv").readLines() + lines = path("$launchDir/results/adjust/B2_S_bracken_abundances.csv").readLines() - assert lines.contains("SAMPLE2,unclassified,0,U,24334,0,24334,0.973866") - assert lines.contains("SAMPLE2,Escherichia coli,562,S,631,0,631,0.025253") - assert lines.contains("SAMPLE2,Bacteroides fragilis,817,S,22,0,22,0.00088") + assert lines.contains("B2,unclassified,0,U,24334,0,24334,0.973866") + assert lines.contains("B2,Escherichia coli,562,S,631,0,631,0.025253") + assert lines.contains("B2,Bacteroides fragilis,817,S,22,0,22,0.00088") - lines = path("$launchDir/results/adjust/SAMPLE2_S_adjusted_report.txt").text + lines = path("$launchDir/results/adjust/B2_S_adjusted_report.txt").text assert lines.contains("97.39 24334 24334 U 0 unclassified") assert lines.contains("2.53 631 631 S 562 Escherichia coli") assert lines.contains("0.09 22 22 S 817 Bacteroides fragilis") // check that KRONA html files exist: - assert path("$launchDir/results/krona/SAMPLE1.krona.html").exists() - assert path("$launchDir/results/krona/SAMPLE2.krona.html").exists() - assert path("$launchDir/results/krona/SAMPLE3.krona.html").exists() + assert path("$launchDir/results/krona/A1_sample.krona.html").exists() + assert 
path("$launchDir/results/krona/B2.krona.html").exists() + assert path("$launchDir/results/krona/B2_SAMPLE3.krona.html").exists() // check failure_report assert path("$launchDir/results/failure/failures_report.csv").exists() @@ -94,15 +94,15 @@ nextflow_pipeline { assert iridanext_global.findAll {it.path == "failure/failures_report.csv" }.size() == 1 // samples output files - assert iridanext_samples.SAMPLE2.findAll { it.path == "adjust/SAMPLE2_S_bracken_abundances.csv" }.size() == 1 - assert iridanext_samples.SAMPLE2.findAll { it.path == "krona/SAMPLE2.krona.html" }.size() == 1 - assert iridanext_samples.SAMPLE2.findAll { it.path == "fastp/SAMPLE2.fastp.html" }.size() == 1 - assert iridanext_samples.SAMPLE3.findAll { it.path == "adjust/SAMPLE3_S_bracken_abundances.csv" }.size() == 1 - assert iridanext_samples.SAMPLE3.findAll { it.path == "krona/SAMPLE3.krona.html" }.size() == 1 - assert iridanext_samples.SAMPLE3.findAll { it.path == "fastp/SAMPLE3.fastp.html" }.size() == 1 - assert iridanext_samples.SAMPLE1.findAll { it.path == "adjust/SAMPLE1_S_bracken_abundances.csv" }.size() == 1 - assert iridanext_samples.SAMPLE1.findAll { it.path == "krona/SAMPLE1.krona.html" }.size() == 1 - assert iridanext_samples.SAMPLE1.findAll { it.path == "fastp/SAMPLE1.fastp.html" }.size() == 1 + assert iridanext_samples.SAMPLE2.findAll { it.path == "adjust/B2_S_bracken_abundances.csv" }.size() == 1 + assert iridanext_samples.SAMPLE2.findAll { it.path == "krona/B2.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE2.findAll { it.path == "fastp/B2.fastp.html" }.size() == 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "adjust/B2_SAMPLE3_S_bracken_abundances.csv" }.size() == 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "krona/B2_SAMPLE3.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "fastp/B2_SAMPLE3.fastp.html" }.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "adjust/A1_sample_S_bracken_abundances.csv" 
}.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "krona/A1_sample.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "fastp/A1_sample.fastp.html" }.size() == 1 // output metadata assert iridanext_metadata.SAMPLE2."abundance_1_name" == "Escherichia coli" @@ -141,7 +141,7 @@ nextflow_pipeline { when { params { - input = "https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/assets/samplesheet.csv" + input = "$projectDir/assets/samplesheet.csv" kraken2_db = "$projectDir/tests/data/kraken2database" bracken_db = "$projectDir/tests/data/brackendatabase" outdir = "results" @@ -159,7 +159,7 @@ nextflow_pipeline { when { params { - input = "https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/assets/samplesheet.csv" + input = "$projectDir/assets/samplesheet.csv" outdir = "results" } } @@ -175,7 +175,7 @@ nextflow_pipeline { when { params { - input = "https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/assets/samplesheet.csv" + input = "$projectDir/assets/samplesheet.csv" kraken2_db = "$projectDir/tests/data/kraken2database" outdir = "results" } @@ -192,7 +192,7 @@ nextflow_pipeline { when { params { - input = "https://raw.githubusercontent.com/phac-nml/speciesabundance/dev/assets/samplesheet.csv" + input = "$projectDir/assets/samplesheet.csv" kraken2_db = "$projectDir/tests/data/kraken2database" bracken_db = "$projectDir/tests/data/kraken2database" outdir = "results" @@ -246,21 +246,21 @@ nextflow_pipeline { assert lines.size() == 4 assert lines.contains("sample,module,error_message") - assert lines.contains("SAMPLE1,FASTP,The input FASTQ file(s) might exhibit either a mismatch in PAIRED files; corruption in one or both SINGLE/PAIRED file(s); or file(s) may not exist in PATH provided by input samplesheet") - assert lines.contains("SAMPLE2,KRAKEN2,The reads may not have passed the quality control and trimming process OR the database directory may be missing required KRAKEN2 files") - assert 
lines.contains("SAMPLE3,BRACKEN,The reads may have failed to classify against the selected Kraken2 database OR the database directory may be missing the Bracken distribution files") + assert lines.contains("A1,FASTP,The input FASTQ file(s) might exhibit either a mismatch in PAIRED files; corruption in one or both SINGLE/PAIRED file(s); or file(s) may not exist in PATH provided by input samplesheet") + assert lines.contains("B2,KRAKEN2,The reads may not have passed the quality control and trimming process OR the database directory may be missing required KRAKEN2 files") + assert lines.contains("C3,BRACKEN,The reads may have failed to classify against the selected Kraken2 database OR the database directory may be missing the Bracken distribution files") // individual sample output checks across the pipeline - assert path("$launchDir/results/fastp/SAMPLE1.fastp.html").exists() == false - assert path("$launchDir/results/fastp/SAMPLE2.fastp.html").exists() - assert path("$launchDir/results/kraken2/SAMPLE2_kraken2_report.txt").exists() == false - assert path("$launchDir/results/kraken2/SAMPLE3_kraken2_report.txt").exists() - assert path("$launchDir/results/bracken/SAMPLE3_S_bracken.txt").exists() == false - assert path("$launchDir/results/bracken/SAMPLE4_S_bracken.txt").exists() - assert path("$launchDir/results/adjust/SAMPLE4_S_adjusted_report.txt").exists() - assert path("$launchDir/results/krona/SAMPLE4.krona.html").exists() - assert path("$launchDir/results/top/SAMPLE4_S_top_5.csv").exists() + assert path("$launchDir/results/fastp/A1.fastp.html").exists() == false + assert path("$launchDir/results/fastp/B2.fastp.html").exists() + assert path("$launchDir/results/kraken2/B2_kraken2_report.txt").exists() == false + assert path("$launchDir/results/kraken2/C3_kraken2_report.txt").exists() + assert path("$launchDir/results/bracken/C3_S_bracken.txt").exists() == false + assert path("$launchDir/results/bracken/D4_S_bracken.txt").exists() + assert 
 path("$launchDir/results/adjust/D4_S_adjusted_report.txt").exists() + assert path("$launchDir/results/krona/D4.krona.html").exists() + assert path("$launchDir/results/top/D4_S_top_5.csv").exists() assert path("$launchDir/results/csvtk/merged_topN.csv").exists() lines = path("$launchDir/results/csvtk/merged_topN.csv").readLines() @@ -296,7 +296,7 @@ nextflow_pipeline { assert lines.size() == 2 assert lines.contains("sample,module,error_message") - assert lines.contains("SAMPLE1,FASTP,The input FASTQ file(s) might exhibit either a mismatch in PAIRED files; corruption in one or both SINGLE/PAIRED file(s); or file(s) may not exist in PATH provided by input samplesheet") + assert lines.contains("A1,FASTP,The input FASTQ file(s) might exhibit either a mismatch in PAIRED files; corruption in one or both SINGLE/PAIRED file(s); or file(s) may not exist in PATH provided by input samplesheet") } } From e4a1ae0e0b65c943ee2eb1cfebb477680688d81c Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 8 Aug 2024 14:54:02 -0400 Subject: [PATCH 05/15] Update workflow logic to set meta.id to meta.irida_id if sample_name is not provided in the samplesheet --- assets/samplesheet.csv | 1 + assets/schema_input.json | 2 +- workflows/speciesabundance.nf | 7 +++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index b8c2da1..22d90f4 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -2,3 +2,4 @@ sample,sample_name,fastq_1,fastq_2 SAMPLE1,A1 sample,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz SAMPLE2,B2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz SAMPLE3,B2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, 
+SAMPLE4,,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, diff --git a/assets/schema_input.json b/assets/schema_input.json index 6792cf4..2d816ae 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -38,6 +38,6 @@ ] } }, - "required": ["sample", "sample_name", "fastq_1"] + "required": ["sample", "fastq_1"] } } diff --git a/workflows/speciesabundance.nf b/workflows/speciesabundance.nf index 859e884..b06c1bd 100644 --- a/workflows/speciesabundance.nf +++ b/workflows/speciesabundance.nf @@ -75,8 +75,11 @@ workflow SpAnce { // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") .map { meta, fastq_1, fastq_2 -> - // Replace spaces in 'id' with underscores - if (meta.id) { + // Set meta.id to meta.irida_if if sample_name is not provided in the samplesheet + if (!meta.id) { + meta.id = meta.irida_id + } else { + // Replace spaces in 'id' with underscores meta.id = meta.id.replaceAll(/\s+/, '_') } // Ensure ID is unique by appending meta.irida_id if needed From 94bc34c73845c4a1fbeace8db6336631a9aa0069 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 8 Aug 2024 15:08:34 -0400 Subject: [PATCH 06/15] Enhanced test coverage by adding a scenario where sample_name is omitted in the input samplesheet. 
--- tests/data/test_iridanext.output.json | 80 ++++++++++++++++++++------- tests/main.nf.test | 9 +++ 2 files changed, 68 insertions(+), 21 deletions(-) diff --git a/tests/data/test_iridanext.output.json b/tests/data/test_iridanext.output.json index b7f8db4..d0a36dc 100644 --- a/tests/data/test_iridanext.output.json +++ b/tests/data/test_iridanext.output.json @@ -6,26 +6,37 @@ } ], "samples": { - "SAMPLE2": [ + "SAMPLE3": [ { - "path": "krona/B2.krona.html" + "path": "krona/B2_SAMPLE3.krona.html" }, { - "path": "adjust/B2_S_bracken_abundances.csv" + "path": "adjust/B2_SAMPLE3_S_bracken_abundances.csv" }, { - "path": "fastp/B2.fastp.html" + "path": "fastp/B2_SAMPLE3.fastp.html" } ], - "SAMPLE3": [ + "SAMPLE4": [ { - "path": "krona/B2_SAMPLE3.krona.html" + "path": "krona/SAMPLE4.krona.html" }, { - "path": "adjust/B2_SAMPLE3_S_bracken_abundances.csv" + "path": "adjust/SAMPLE4_S_bracken_abundances.csv" }, { - "path": "fastp/B2_SAMPLE3.fastp.html" + "path": "fastp/SAMPLE4.fastp.html" + } + ], + "SAMPLE2": [ + { + "path": "krona/B2.krona.html" + }, + { + "path": "adjust/B2_S_bracken_abundances.csv" + }, + { + "path": "fastp/B2.fastp.html" } ], "SAMPLE1": [ @@ -43,16 +54,16 @@ }, "metadata": { "samples": { - "SAMPLE2": { + "SAMPLE3": { "taxonomy_level": "S", - "abundance_1_name": "Escherichia coli", - "abundance_1_ncbi_taxonomy_id": "562", - "abundance_1_num_assigned_reads": "631", - "abundance_1_fraction_total_reads": "0.025253", - "abundance_2_name": "Bacteroides fragilis", - "abundance_2_ncbi_taxonomy_id": "817", - "abundance_2_num_assigned_reads": "22", - "abundance_2_fraction_total_reads": "0.00088", + "abundance_1_name": "Bacteroides fragilis", + "abundance_1_ncbi_taxonomy_id": "817", + "abundance_1_num_assigned_reads": "28877", + "abundance_1_fraction_total_reads": "0.577702", + "abundance_2_name": "Escherichia coli", + "abundance_2_ncbi_taxonomy_id": "562", + "abundance_2_num_assigned_reads": "21065", + "abundance_2_fraction_total_reads": "0.421418", 
"abundance_3_name": "", "abundance_3_ncbi_taxonomy_id": "", "abundance_3_num_assigned_reads": "", @@ -67,10 +78,10 @@ "abundance_5_fraction_total_reads": "", "unclassified_name": "unclassified", "unclassified_ncbi_taxonomy_id": "0", - "unclassified_num_assigned_reads": "24334", - "unclassified_fraction_total_reads": "0.973866" + "unclassified_num_assigned_reads": "44", + "unclassified_fraction_total_reads": "0.00088" }, - "SAMPLE3": { + "SAMPLE4": { "taxonomy_level": "S", "abundance_1_name": "Bacteroides fragilis", "abundance_1_ncbi_taxonomy_id": "817", @@ -97,6 +108,33 @@ "unclassified_num_assigned_reads": "44", "unclassified_fraction_total_reads": "0.00088" }, + "SAMPLE2": { + "taxonomy_level": "S", + "abundance_1_name": "Escherichia coli", + "abundance_1_ncbi_taxonomy_id": "562", + "abundance_1_num_assigned_reads": "631", + "abundance_1_fraction_total_reads": "0.025253", + "abundance_2_name": "Bacteroides fragilis", + "abundance_2_ncbi_taxonomy_id": "817", + "abundance_2_num_assigned_reads": "22", + "abundance_2_fraction_total_reads": "0.00088", + "abundance_3_name": "", + "abundance_3_ncbi_taxonomy_id": "", + "abundance_3_num_assigned_reads": "", + "abundance_3_fraction_total_reads": "", + "abundance_4_name": "", + "abundance_4_ncbi_taxonomy_id": "", + "abundance_4_num_assigned_reads": "", + "abundance_4_fraction_total_reads": "", + "abundance_5_name": "", + "abundance_5_ncbi_taxonomy_id": "", + "abundance_5_num_assigned_reads": "", + "abundance_5_fraction_total_reads": "", + "unclassified_name": "unclassified", + "unclassified_ncbi_taxonomy_id": "0", + "unclassified_num_assigned_reads": "24334", + "unclassified_fraction_total_reads": "0.973866" + }, "SAMPLE1": { "taxonomy_level": "S", "abundance_1_name": "Bacteroides fragilis", @@ -126,4 +164,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/main.nf.test b/tests/main.nf.test index 8a277bc..d6a034b 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -71,6 +71,7 @@ nextflow_pipeline { 
assert path("$launchDir/results/krona/A1_sample.krona.html").exists() assert path("$launchDir/results/krona/B2.krona.html").exists() assert path("$launchDir/results/krona/B2_SAMPLE3.krona.html").exists() + assert path("$launchDir/results/krona/SAMPLE4.krona.html").exists() // check failure_report assert path("$launchDir/results/failure/failures_report.csv").exists() @@ -103,6 +104,9 @@ nextflow_pipeline { assert iridanext_samples.SAMPLE1.findAll { it.path == "adjust/A1_sample_S_bracken_abundances.csv" }.size() == 1 assert iridanext_samples.SAMPLE1.findAll { it.path == "krona/A1_sample.krona.html" }.size() == 1 assert iridanext_samples.SAMPLE1.findAll { it.path == "fastp/A1_sample.fastp.html" }.size() == 1 + assert iridanext_samples.SAMPLE4.findAll { it.path == "adjust/SAMPLE4_S_bracken_abundances.csv" }.size() == 1 + assert iridanext_samples.SAMPLE4.findAll { it.path == "krona/SAMPLE4.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE4.findAll { it.path == "fastp/SAMPLE4.fastp.html" }.size() == 1 // output metadata assert iridanext_metadata.SAMPLE2."abundance_1_name" == "Escherichia coli" @@ -129,6 +133,11 @@ nextflow_pipeline { assert iridanext_metadata.SAMPLE3."abundance_1_num_assigned_reads" == "28877" assert iridanext_metadata.SAMPLE3."abundance_1_fraction_total_reads" == "0.577702" + assert iridanext_metadata.SAMPLE4."abundance_1_name" == "Bacteroides fragilis" + assert iridanext_metadata.SAMPLE4."abundance_1_ncbi_taxonomy_id" == "817" + assert iridanext_metadata.SAMPLE4."abundance_1_num_assigned_reads" == "28877" + assert iridanext_metadata.SAMPLE4."abundance_1_fraction_total_reads" == "0.577702" + assert iridanext_metadata.SAMPLE1."abundance_1_name" == "Bacteroides fragilis" assert iridanext_metadata.SAMPLE1."abundance_1_ncbi_taxonomy_id" == "817" assert iridanext_metadata.SAMPLE1."abundance_1_num_assigned_reads" == "28799" From b96f2bc8fc5fedab14f447be6e8219f30de2d113 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 8 Aug 2024 15:10:39 -0400 
Subject: [PATCH 07/15] Fixed linting issue --- tests/data/test_iridanext.output.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/test_iridanext.output.json b/tests/data/test_iridanext.output.json index d0a36dc..db38f2a 100644 --- a/tests/data/test_iridanext.output.json +++ b/tests/data/test_iridanext.output.json @@ -164,4 +164,4 @@ } } } -} \ No newline at end of file +} From 3a081bb0c6174f9c1e62e5ef94c8f8ce89b5c897 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 12 Aug 2024 11:38:22 -0400 Subject: [PATCH 08/15] Fixed typo --- workflows/speciesabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/speciesabundance.nf b/workflows/speciesabundance.nf index b06c1bd..8874b64 100644 --- a/workflows/speciesabundance.nf +++ b/workflows/speciesabundance.nf @@ -75,7 +75,7 @@ workflow SpAnce { // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") .map { meta, fastq_1, fastq_2 -> - // Set meta.id to meta.irida_if if sample_name is not provided in the samplesheet + // Set meta.id to meta.irida_id if sample_name is not provided in the samplesheet if (!meta.id) { meta.id = meta.irida_id } else { From cc6a23d7fba904b92ee01f29bddfee0aec760666 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 12 Aug 2024 13:51:31 -0400 Subject: [PATCH 09/15] Replace non-alphanumeric characters in sample IDs with underscores --- workflows/speciesabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/speciesabundance.nf b/workflows/speciesabundance.nf index 8874b64..80f013a 100644 --- a/workflows/speciesabundance.nf +++ b/workflows/speciesabundance.nf @@ -80,7 +80,7 @@ workflow SpAnce { meta.id = meta.irida_id } else { // Replace spaces in 'id' with underscores - meta.id = meta.id.replaceAll(/\s+/, '_') + meta.id = meta.id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') } // Ensure ID is unique by appending meta.irida_id if needed 
while (processedIDs.contains(meta.id)) { From 5dd474e717ea09f6f93c08486d5833b1245d6519 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 12 Aug 2024 13:55:59 -0400 Subject: [PATCH 10/15] Updated main.nf.test to include tests for various sample_name scenarios --- assets/samplesheet.csv | 5 +- .../samplename.test_iridanext.output.json | 167 ++++++++++++++++++ tests/data/samplename_samplesheet.csv | 5 + tests/data/test_iridanext.output.json | 78 +++----- tests/main.nf.test | 155 ++++++++++++++-- 5 files changed, 338 insertions(+), 72 deletions(-) create mode 100644 tests/data/samplename.test_iridanext.output.json create mode 100644 tests/data/samplename_samplesheet.csv diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 22d90f4..46a711c 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,5 +1,4 @@ sample,sample_name,fastq_1,fastq_2 -SAMPLE1,A1 sample,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz +SAMPLE1,A1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz SAMPLE2,B2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz -SAMPLE3,B2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, -SAMPLE4,,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, +SAMPLE3,C3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, diff --git a/tests/data/samplename.test_iridanext.output.json b/tests/data/samplename.test_iridanext.output.json new file mode 100644 index 0000000..8437b18 --- /dev/null +++ b/tests/data/samplename.test_iridanext.output.json @@ -0,0 +1,167 @@ +{ + "files": { + 
"global": [ + { + "path": "failure/failures_report.csv" + } + ], + "samples": { + "SAMPLE2": [ + { + "path": "krona/B2_.krona.html" + }, + { + "path": "adjust/B2__S_bracken_abundances.csv" + }, + { + "path": "fastp/B2_.fastp.html" + } + ], + "SAMPLE3": [ + { + "path": "krona/B2__SAMPLE3.krona.html" + }, + { + "path": "adjust/B2__SAMPLE3_S_bracken_abundances.csv" + }, + { + "path": "fastp/B2__SAMPLE3.fastp.html" + } + ], + "SAMPLE4": [ + { + "path": "krona/SAMPLE4.krona.html" + }, + { + "path": "adjust/SAMPLE4_S_bracken_abundances.csv" + }, + { + "path": "fastp/SAMPLE4.fastp.html" + } + ], + "SAMPLE1": [ + { + "path": "krona/A1__sample1.krona.html" + }, + { + "path": "adjust/A1__sample1_S_bracken_abundances.csv" + }, + { + "path": "fastp/A1__sample1.fastp.html" + } + ] + } + }, + "metadata": { + "samples": { + "SAMPLE4": { + "taxonomy_level": "S", + "abundance_1_name": "Bacteroides fragilis", + "abundance_1_ncbi_taxonomy_id": "817", + "abundance_1_num_assigned_reads": "28877", + "abundance_1_fraction_total_reads": "0.577702", + "abundance_2_name": "Escherichia coli", + "abundance_2_ncbi_taxonomy_id": "562", + "abundance_2_num_assigned_reads": "21065", + "abundance_2_fraction_total_reads": "0.421418", + "abundance_3_name": "", + "abundance_3_ncbi_taxonomy_id": "", + "abundance_3_num_assigned_reads": "", + "abundance_3_fraction_total_reads": "", + "abundance_4_name": "", + "abundance_4_ncbi_taxonomy_id": "", + "abundance_4_num_assigned_reads": "", + "abundance_4_fraction_total_reads": "", + "abundance_5_name": "", + "abundance_5_ncbi_taxonomy_id": "", + "abundance_5_num_assigned_reads": "", + "abundance_5_fraction_total_reads": "", + "unclassified_name": "unclassified", + "unclassified_ncbi_taxonomy_id": "0", + "unclassified_num_assigned_reads": "44", + "unclassified_fraction_total_reads": "0.00088" + }, + "SAMPLE2": { + "taxonomy_level": "S", + "abundance_1_name": "Escherichia coli", + "abundance_1_ncbi_taxonomy_id": "562", + "abundance_1_num_assigned_reads": "631", 
+ "abundance_1_fraction_total_reads": "0.025253", + "abundance_2_name": "Bacteroides fragilis", + "abundance_2_ncbi_taxonomy_id": "817", + "abundance_2_num_assigned_reads": "22", + "abundance_2_fraction_total_reads": "0.00088", + "abundance_3_name": "", + "abundance_3_ncbi_taxonomy_id": "", + "abundance_3_num_assigned_reads": "", + "abundance_3_fraction_total_reads": "", + "abundance_4_name": "", + "abundance_4_ncbi_taxonomy_id": "", + "abundance_4_num_assigned_reads": "", + "abundance_4_fraction_total_reads": "", + "abundance_5_name": "", + "abundance_5_ncbi_taxonomy_id": "", + "abundance_5_num_assigned_reads": "", + "abundance_5_fraction_total_reads": "", + "unclassified_name": "unclassified", + "unclassified_ncbi_taxonomy_id": "0", + "unclassified_num_assigned_reads": "24334", + "unclassified_fraction_total_reads": "0.973866" + }, + "SAMPLE3": { + "taxonomy_level": "S", + "abundance_1_name": "Bacteroides fragilis", + "abundance_1_ncbi_taxonomy_id": "817", + "abundance_1_num_assigned_reads": "28877", + "abundance_1_fraction_total_reads": "0.577702", + "abundance_2_name": "Escherichia coli", + "abundance_2_ncbi_taxonomy_id": "562", + "abundance_2_num_assigned_reads": "21065", + "abundance_2_fraction_total_reads": "0.421418", + "abundance_3_name": "", + "abundance_3_ncbi_taxonomy_id": "", + "abundance_3_num_assigned_reads": "", + "abundance_3_fraction_total_reads": "", + "abundance_4_name": "", + "abundance_4_ncbi_taxonomy_id": "", + "abundance_4_num_assigned_reads": "", + "abundance_4_fraction_total_reads": "", + "abundance_5_name": "", + "abundance_5_ncbi_taxonomy_id": "", + "abundance_5_num_assigned_reads": "", + "abundance_5_fraction_total_reads": "", + "unclassified_name": "unclassified", + "unclassified_ncbi_taxonomy_id": "0", + "unclassified_num_assigned_reads": "44", + "unclassified_fraction_total_reads": "0.00088" + }, + "SAMPLE1": { + "taxonomy_level": "S", + "abundance_1_name": "Bacteroides fragilis", + "abundance_1_ncbi_taxonomy_id": "817", + 
"abundance_1_num_assigned_reads": "28799", + "abundance_1_fraction_total_reads": "0.578328", + "abundance_2_name": "Escherichia coli", + "abundance_2_ncbi_taxonomy_id": "562", + "abundance_2_num_assigned_reads": "20997", + "abundance_2_fraction_total_reads": "0.421652", + "abundance_3_name": "", + "abundance_3_ncbi_taxonomy_id": "", + "abundance_3_num_assigned_reads": "", + "abundance_3_fraction_total_reads": "", + "abundance_4_name": "", + "abundance_4_ncbi_taxonomy_id": "", + "abundance_4_num_assigned_reads": "", + "abundance_4_fraction_total_reads": "", + "abundance_5_name": "", + "abundance_5_ncbi_taxonomy_id": "", + "abundance_5_num_assigned_reads": "", + "abundance_5_fraction_total_reads": "", + "unclassified_name": "unclassified", + "unclassified_ncbi_taxonomy_id": "0", + "unclassified_num_assigned_reads": "1", + "unclassified_fraction_total_reads": "2e-05" + } + } + } +} diff --git a/tests/data/samplename_samplesheet.csv b/tests/data/samplename_samplesheet.csv new file mode 100644 index 0000000..45cf612 --- /dev/null +++ b/tests/data/samplename_samplesheet.csv @@ -0,0 +1,5 @@ +sample,sample_name,fastq_1,fastq_2 +SAMPLE1,A1@ sample1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz +SAMPLE2,B2@,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz +SAMPLE3,B2#,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, +SAMPLE4,,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, diff --git a/tests/data/test_iridanext.output.json b/tests/data/test_iridanext.output.json index db38f2a..906011e 100644 --- a/tests/data/test_iridanext.output.json +++ b/tests/data/test_iridanext.output.json @@ -6,26 +6,15 @@ } ], "samples": { - "SAMPLE3": [ - { - "path": 
"krona/B2_SAMPLE3.krona.html" - }, - { - "path": "adjust/B2_SAMPLE3_S_bracken_abundances.csv" - }, - { - "path": "fastp/B2_SAMPLE3.fastp.html" - } - ], - "SAMPLE4": [ + "SAMPLE1": [ { - "path": "krona/SAMPLE4.krona.html" + "path": "krona/A1.krona.html" }, { - "path": "adjust/SAMPLE4_S_bracken_abundances.csv" + "path": "adjust/A1_S_bracken_abundances.csv" }, { - "path": "fastp/SAMPLE4.fastp.html" + "path": "fastp/A1.fastp.html" } ], "SAMPLE2": [ @@ -39,31 +28,31 @@ "path": "fastp/B2.fastp.html" } ], - "SAMPLE1": [ + "SAMPLE3": [ { - "path": "krona/A1_sample.krona.html" + "path": "krona/C3.krona.html" }, { - "path": "adjust/A1_sample_S_bracken_abundances.csv" + "path": "adjust/C3_S_bracken_abundances.csv" }, { - "path": "fastp/A1_sample.fastp.html" + "path": "fastp/C3.fastp.html" } ] } }, "metadata": { "samples": { - "SAMPLE3": { + "SAMPLE2": { "taxonomy_level": "S", - "abundance_1_name": "Bacteroides fragilis", - "abundance_1_ncbi_taxonomy_id": "817", - "abundance_1_num_assigned_reads": "28877", - "abundance_1_fraction_total_reads": "0.577702", - "abundance_2_name": "Escherichia coli", - "abundance_2_ncbi_taxonomy_id": "562", - "abundance_2_num_assigned_reads": "21065", - "abundance_2_fraction_total_reads": "0.421418", + "abundance_1_name": "Escherichia coli", + "abundance_1_ncbi_taxonomy_id": "562", + "abundance_1_num_assigned_reads": "631", + "abundance_1_fraction_total_reads": "0.025253", + "abundance_2_name": "Bacteroides fragilis", + "abundance_2_ncbi_taxonomy_id": "817", + "abundance_2_num_assigned_reads": "22", + "abundance_2_fraction_total_reads": "0.00088", "abundance_3_name": "", "abundance_3_ncbi_taxonomy_id": "", "abundance_3_num_assigned_reads": "", @@ -78,10 +67,10 @@ "abundance_5_fraction_total_reads": "", "unclassified_name": "unclassified", "unclassified_ncbi_taxonomy_id": "0", - "unclassified_num_assigned_reads": "44", - "unclassified_fraction_total_reads": "0.00088" + "unclassified_num_assigned_reads": "24334", + 
"unclassified_fraction_total_reads": "0.973866" }, - "SAMPLE4": { + "SAMPLE3": { "taxonomy_level": "S", "abundance_1_name": "Bacteroides fragilis", "abundance_1_ncbi_taxonomy_id": "817", @@ -108,33 +97,6 @@ "unclassified_num_assigned_reads": "44", "unclassified_fraction_total_reads": "0.00088" }, - "SAMPLE2": { - "taxonomy_level": "S", - "abundance_1_name": "Escherichia coli", - "abundance_1_ncbi_taxonomy_id": "562", - "abundance_1_num_assigned_reads": "631", - "abundance_1_fraction_total_reads": "0.025253", - "abundance_2_name": "Bacteroides fragilis", - "abundance_2_ncbi_taxonomy_id": "817", - "abundance_2_num_assigned_reads": "22", - "abundance_2_fraction_total_reads": "0.00088", - "abundance_3_name": "", - "abundance_3_ncbi_taxonomy_id": "", - "abundance_3_num_assigned_reads": "", - "abundance_3_fraction_total_reads": "", - "abundance_4_name": "", - "abundance_4_ncbi_taxonomy_id": "", - "abundance_4_num_assigned_reads": "", - "abundance_4_fraction_total_reads": "", - "abundance_5_name": "", - "abundance_5_ncbi_taxonomy_id": "", - "abundance_5_num_assigned_reads": "", - "abundance_5_fraction_total_reads": "", - "unclassified_name": "unclassified", - "unclassified_ncbi_taxonomy_id": "0", - "unclassified_num_assigned_reads": "24334", - "unclassified_fraction_total_reads": "0.973866" - }, "SAMPLE1": { "taxonomy_level": "S", "abundance_1_name": "Bacteroides fragilis", diff --git a/tests/main.nf.test b/tests/main.nf.test index d6a034b..1dfd657 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -19,12 +19,12 @@ nextflow_pipeline { assert path("$launchDir/results").exists() // check FASTP_TRIM JSON outputs from paired and single reads - def fastp_JSON_paired = path("$launchDir/results/fastp/A1_sample.fastp.json").json + def fastp_JSON_paired = path("$launchDir/results/fastp/A1.fastp.json").json assert fastp_JSON_paired.summary.sequencing == "paired end (126 cycles + 126 cycles)" assert fastp_JSON_paired.filtering_result.passed_filter_reads == 99594 - def 
fastp_JSON_single = path("$launchDir/results/fastp/B2_SAMPLE3.fastp.json").json + def fastp_JSON_single = path("$launchDir/results/fastp/C3.fastp.json").json assert fastp_JSON_single.summary.sequencing == "single end (126 cycles)" assert fastp_JSON_single.filtering_result.passed_filter_reads == 49986 @@ -68,10 +68,9 @@ nextflow_pipeline { assert lines.contains("0.09 22 22 S 817 Bacteroides fragilis") // check that KRONA html files exist: - assert path("$launchDir/results/krona/A1_sample.krona.html").exists() + assert path("$launchDir/results/krona/A1.krona.html").exists() assert path("$launchDir/results/krona/B2.krona.html").exists() - assert path("$launchDir/results/krona/B2_SAMPLE3.krona.html").exists() - assert path("$launchDir/results/krona/SAMPLE4.krona.html").exists() + assert path("$launchDir/results/krona/C3.krona.html").exists() // check failure_report assert path("$launchDir/results/failure/failures_report.csv").exists() @@ -98,12 +97,146 @@ nextflow_pipeline { assert iridanext_samples.SAMPLE2.findAll { it.path == "adjust/B2_S_bracken_abundances.csv" }.size() == 1 assert iridanext_samples.SAMPLE2.findAll { it.path == "krona/B2.krona.html" }.size() == 1 assert iridanext_samples.SAMPLE2.findAll { it.path == "fastp/B2.fastp.html" }.size() == 1 - assert iridanext_samples.SAMPLE3.findAll { it.path == "adjust/B2_SAMPLE3_S_bracken_abundances.csv" }.size() == 1 - assert iridanext_samples.SAMPLE3.findAll { it.path == "krona/B2_SAMPLE3.krona.html" }.size() == 1 - assert iridanext_samples.SAMPLE3.findAll { it.path == "fastp/B2_SAMPLE3.fastp.html" }.size() == 1 - assert iridanext_samples.SAMPLE1.findAll { it.path == "adjust/A1_sample_S_bracken_abundances.csv" }.size() == 1 - assert iridanext_samples.SAMPLE1.findAll { it.path == "krona/A1_sample.krona.html" }.size() == 1 - assert iridanext_samples.SAMPLE1.findAll { it.path == "fastp/A1_sample.fastp.html" }.size() == 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "adjust/C3_S_bracken_abundances.csv" }.size() 
== 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "krona/C3.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "fastp/C3.fastp.html" }.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "adjust/A1_S_bracken_abundances.csv" }.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "krona/A1.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "fastp/A1.fastp.html" }.size() == 1 + + // output metadata + assert iridanext_metadata.SAMPLE2."abundance_1_name" == "Escherichia coli" + assert iridanext_metadata.SAMPLE2."abundance_1_ncbi_taxonomy_id" == "562" + assert iridanext_metadata.SAMPLE2."abundance_1_num_assigned_reads" == "631" + assert iridanext_metadata.SAMPLE2."abundance_1_fraction_total_reads" == "0.025253" + assert iridanext_metadata.SAMPLE2."abundance_2_name" == "Bacteroides fragilis" + assert iridanext_metadata.SAMPLE2."abundance_2_ncbi_taxonomy_id" == "817" + assert iridanext_metadata.SAMPLE2."abundance_2_num_assigned_reads" == "22" + assert iridanext_metadata.SAMPLE2."abundance_2_fraction_total_reads" == "0.00088" + assert iridanext_metadata.SAMPLE2."abundance_3_name" == "" + assert iridanext_metadata.SAMPLE2."abundance_3_fraction_total_reads" == "" + assert iridanext_metadata.SAMPLE2."abundance_4_name" == "" + assert iridanext_metadata.SAMPLE2."abundance_4_fraction_total_reads" == "" + assert iridanext_metadata.SAMPLE2."abundance_5_name" == "" + assert iridanext_metadata.SAMPLE2."abundance_5_fraction_total_reads" == "" + assert iridanext_metadata.SAMPLE2."unclassified_name" == "unclassified" + assert iridanext_metadata.SAMPLE2."unclassified_ncbi_taxonomy_id" == "0" + assert iridanext_metadata.SAMPLE2."unclassified_num_assigned_reads" == "24334" + assert iridanext_metadata.SAMPLE2."unclassified_fraction_total_reads" == "0.973866" + + assert iridanext_metadata.SAMPLE3."abundance_1_name" == "Bacteroides fragilis" + assert 
iridanext_metadata.SAMPLE3."abundance_1_ncbi_taxonomy_id" == "817" + assert iridanext_metadata.SAMPLE3."abundance_1_num_assigned_reads" == "28877" + assert iridanext_metadata.SAMPLE3."abundance_1_fraction_total_reads" == "0.577702" + + assert iridanext_metadata.SAMPLE1."abundance_1_name" == "Bacteroides fragilis" + assert iridanext_metadata.SAMPLE1."abundance_1_ncbi_taxonomy_id" == "817" + assert iridanext_metadata.SAMPLE1."abundance_1_num_assigned_reads" == "28799" + assert iridanext_metadata.SAMPLE1."abundance_1_fraction_total_reads" == "0.578328" + } + } + + test("Testing different sample_name scenarios") { + tag "pipeline_success" + + when { + params { + input = "$projectDir/tests/data/samplename_samplesheet.csv" + database = "$projectDir/tests/data/minidb" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // check FASTP_TRIM JSON outputs from paired and single reads + def fastp_JSON_paired = path("$launchDir/results/fastp/A1__sample1.fastp.json").json + + assert fastp_JSON_paired.summary.sequencing == "paired end (126 cycles + 126 cycles)" + assert fastp_JSON_paired.filtering_result.passed_filter_reads == 99594 + + def fastp_JSON_single = path("$launchDir/results/fastp/B2__SAMPLE3.fastp.json").json + + assert fastp_JSON_single.summary.sequencing == "single end (126 cycles)" + assert fastp_JSON_single.filtering_result.passed_filter_reads == 49986 + + // check output files + def lines = [] + + // KRAKEN2 outputs + lines = path("$launchDir/results/kraken2/B2__kraken2_report.txt").text + + assert lines.contains("97.38 24334 24334 U 0 unclassified") + assert lines.contains(" 2.62 655 2 D 2 Bacteria") + assert lines.contains(" 2.53 631 0 S 562 Escherichia coli") + assert lines.contains(" 2.53 631 631 S2 511145 Escherichia coli str. K-12 substr. 
MG1655") + assert lines.contains(" 0.09 22 0 S 817 Bacteroides fragilis") + assert lines.contains(" 0.09 22 22 S1 295405 Bacteroides fragilis YCH46") + + // BRACKEN outputs + lines = path("$launchDir/results/bracken/B2__S_bracken.txt").text + + assert lines.contains("100.00 653 0 D 2 Bacteria") + assert lines.contains("96.63 631 631 S 562 Escherichia coli") + assert lines.contains("3.37 22 22 S 817 Bacteroides fragilis") + + lines = path("$launchDir/results/bracken/B2__S_bracken_abundances_unsorted.tsv").text + + assert lines.contains("Escherichia coli 562 S 631 0 631 0.96631") + assert lines.contains("Bacteroides fragilis 817 S 22 0 22 0.03369") + + // ADJUST_BRACKEN outputs + lines = path("$launchDir/results/adjust/B2__S_bracken_abundances.csv").readLines() + + assert lines.contains("B2_,unclassified,0,U,24334,0,24334,0.973866") + assert lines.contains("B2_,Escherichia coli,562,S,631,0,631,0.025253") + assert lines.contains("B2_,Bacteroides fragilis,817,S,22,0,22,0.00088") + + lines = path("$launchDir/results/adjust/B2__S_adjusted_report.txt").text + + assert lines.contains("97.39 24334 24334 U 0 unclassified") + assert lines.contains("2.53 631 631 S 562 Escherichia coli") + assert lines.contains("0.09 22 22 S 817 Bacteroides fragilis") + + // check that KRONA html files exist: + assert path("$launchDir/results/krona/A1__sample1.krona.html").exists() + assert path("$launchDir/results/krona/B2_.krona.html").exists() + assert path("$launchDir/results/krona/B2__SAMPLE3.krona.html").exists() + assert path("$launchDir/results/krona/SAMPLE4.krona.html").exists() + + // check failure_report + assert path("$launchDir/results/failure/failures_report.csv").exists() + + lines = path("$launchDir/results/failure/failures_report.csv").readLines() + + assert lines.size() == 2 + assert lines.contains("sample,module,error_message") + assert lines.contains(",,No samples failed pipeline execution") + + // check IRIDA Next JSON output + + assert 
path("$launchDir/results/iridanext.output.json").json == path("$projectDir/tests/data/samplename.test_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + // global output file + assert iridanext_global.findAll {it.path == "failure/failures_report.csv" }.size() == 1 + + // samples output files + assert iridanext_samples.SAMPLE2.findAll { it.path == "adjust/B2__S_bracken_abundances.csv" }.size() == 1 + assert iridanext_samples.SAMPLE2.findAll { it.path == "krona/B2_.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE2.findAll { it.path == "fastp/B2_.fastp.html" }.size() == 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "adjust/B2__SAMPLE3_S_bracken_abundances.csv" }.size() == 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "krona/B2__SAMPLE3.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE3.findAll { it.path == "fastp/B2__SAMPLE3.fastp.html" }.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "adjust/A1__sample1_S_bracken_abundances.csv" }.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "krona/A1__sample1.krona.html" }.size() == 1 + assert iridanext_samples.SAMPLE1.findAll { it.path == "fastp/A1__sample1.fastp.html" }.size() == 1 assert iridanext_samples.SAMPLE4.findAll { it.path == "adjust/SAMPLE4_S_bracken_abundances.csv" }.size() == 1 assert iridanext_samples.SAMPLE4.findAll { it.path == "krona/SAMPLE4.krona.html" }.size() == 1 assert iridanext_samples.SAMPLE4.findAll { it.path == "fastp/SAMPLE4.fastp.html" }.size() == 1 From f56b9e7752996978af5b43084b9432e7684ec962 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 16 Sep 2024 13:48:15 -0400 Subject: [PATCH 11/15] Update input_schema --- assets/schema_input.json | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 2d816ae..c09a283 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -17,7 +17,7 @@ "sample_name": { "type": "string", "meta": ["id"], - "errorMessage": "Sample name must be provided" + "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next." }, "fastq_1": { "type": "string", From 4a457d93cddc9fa2ca8ee8c37168c57a325e9d5f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Tue, 24 Sep 2024 17:22:56 -0400 Subject: [PATCH 12/15] Update documentation --- CHANGELOG.md | 10 ++++++++++ README.md | 10 ++++++++++ assets/samplesheet_minimal.csv | 5 +++++ docs/usage.md | 34 +++++++++++++++++++++++++--------- 4 files changed, 50 insertions(+), 9 deletions(-) create mode 100644 assets/samplesheet_minimal.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b4d1f3..c8bf27e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Development + +### `Changed` + +- Added the ability to include a `sample_name` column in the input samplesheet.csv. 
Allows for compatibility with IRIDA-Next input configuration [PR24](https://github.com/phac-nml/speciesabundance/pull/24) + ## 2.1.1 - 2024/05/02 ### `Changed` @@ -36,3 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Dependencies` ### `Deprecated` + +[2.0.0]: https://github.com/phac-nml/speciesabundance/releases/tag/2.0.0 +[2.1.0]: https://github.com/phac-nml/speciesabundance/releases/tag/2.1.0 +[2.1.1]: https://github.com/phac-nml/speciesabundance/releases/tag/2.1.1 diff --git a/README.md b/README.md index 5a25273..8308732 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,16 @@ The input to the pipeline is a standard sample sheet (passed as `--input samples The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/). +## IRIDA-Next Optional Input Configuration + +`speciesabundance` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which can contain an additional column: `sample_name` + +`sample_name`: An **optional** column, that overrides `sample` for outputs (filenames and sample names) and reference assembly identification. + +`sample_name`, allows more flexibility in naming output files or sample identification. Unlike `sample`, `sample_name` is not required to contain unique values. `Nextflow` requires unique sample names, and therefore in the instance of repeat `sample_names`, `sample` will be suffixed to any `sample_name`. Non-alphanumeric characters (excluding `_`,`-`,`.`) will be replaced with `"_"`. + +An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
+ # Parameters ## Mandatory diff --git a/assets/samplesheet_minimal.csv b/assets/samplesheet_minimal.csv new file mode 100644 index 0000000..8820b5d --- /dev/null +++ b/assets/samplesheet_minimal.csv @@ -0,0 +1,5 @@ +sample,fastq_1,fastq_2 +SAMPLE1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz +SAMPLE2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz +SAMPLE3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, + diff --git a/docs/usage.md b/docs/usage.md index 9ec95ce..4b4189b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -15,24 +15,40 @@ You will need to create a samplesheet with information about the samples you wou ### Full samplesheet The input samplesheet must contain three columns: `sample`, `fastq_1`, `fastq_2`. The sampleIDs within a samplesheet should be unqiue. All other columns will be ignored. +This pipeline does not support the processing of long-read sequencing data (Nanopore or PacBio). A final samplesheet file consisting of both single- and paired-end Illumina short read data may look something like the one below. -This pipleine does not support the processing of long-read sequencing data (Nanopore or PacBio). -```csv title="samplesheet.csv" +```csv title="samplesheet_minimal.csv" sample,fastq_1,fastq_2 SAMPLE1,sample1_R1.fastq.gz,sample1_R2.fastq.gz SAMPLE2,sample2_R1.fastq.gz,sample2_R2.fastq.gz -SAMPLE3,sample1_R1.fastq.gz, +SAMPLE3,sample3_R1.fastq.gz, +``` + +A minimal [example samplesheet](../assets/samplesheet_minimal.csv) has been provided with the pipeline. 
+ +### IRIDA-Next Optional Samplesheet Configuration + +`speciesabundance` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which contain the following columns: `sample`, `sample_name`, `fastq_1`, and `fastq_2`. The sample IDs within a samplesheet should be unique. + +A final samplesheet file consisting of both single- and paired-end data may look something like the one below. + +```csv title="samplesheet.csv" +sample,sample_name,fastq_1,fastq_2 +SAMPLE1,A1,sample1_R1.fastq.gz,sample1_R2.fastq.gz +SAMPLE2,B2,sample2_R1.fastq.gz,sample2_R2.fastq.gz +SAMPLE3,C3,sample3_R1.fastq.gz, ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. Samples should be unique within a samplesheet. | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| ------------- | -------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. Samples should be unique within a samplesheet. | +| `sample_name` | Sample name used in outputs (filenames and sample names) | +| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline, which includes the `sample_name` column. 
## Running the pipeline From b78388a2947ac8e2659178742f49616d9999312b Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 25 Sep 2024 12:23:27 -0400 Subject: [PATCH 13/15] Updated README.md --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8308732..baebc62 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ The input to the pipeline is a standard sample sheet (passed as `--input samples | ------- | --------------- | --------------- | | SampleA | file_1.fastq.gz | file_2.fastq.gz | +A minimal [example samplesheet](../assets/samplesheet_minimal.csv) has been provided with the pipeline. + The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/). ## IRIDA-Next Optional Input Configuration @@ -24,7 +26,13 @@ The structure of this file is defined in [assets/schema_input.json](assets/schem `sample_name`, allows more flexibility in naming output files or sample identification. Unlike `sample`, `sample_name` is not required to contain unique values. `Nextflow` requires unique sample names, and therefore in the instance of repeat `sample_names`, `sample` will be suffixed to any `sample_name`. Non-alphanumeric characters (excluding `_`,`-`,`.`) will be replaced with `"_"`. -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +The sample sheet, when including the optional `sample_name` column, should look like: + +| sample | sample_name | fastq_1 | fastq_2 | +| ------- | ----------- | --------------- | --------------- | +| SampleA | A1 | file_1.fastq.gz | file_2.fastq.gz | + +An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline, which includes the `sample_name` column. 
# Parameters From 1af1a8f11862db81632f58d585c6099bbe91a355 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 25 Sep 2024 17:05:49 -0400 Subject: [PATCH 14/15] Updates to documentation for readability --- CHANGELOG.md | 3 +++ README.md | 4 ++-- assets/samplesheet.csv | 8 ++++---- assets/samplesheet_minimal.csv | 5 ----- docs/usage.md | 4 ++-- 5 files changed, 11 insertions(+), 13 deletions(-) delete mode 100644 assets/samplesheet_minimal.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index c8bf27e..c5e66b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` - Added the ability to include a `sample_name` column in the input samplesheet.csv. Allows for compatibility with IRIDA-Next input configuration [PR24](https://github.com/phac-nml/speciesabundance/pull/24) + - `sample_name` special characters will be replaced with `"_"` + - If no `sample_name` is supplied in the column sample will be used + - To avoid repeat values for `sample_name` all `sample_name` values will be suffixed with the unique `sample` value from the input file ## 2.1.1 - 2024/05/02 diff --git a/README.md b/README.md index baebc62..b780f93 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The input to the pipeline is a standard sample sheet (passed as `--input samples | ------- | --------------- | --------------- | | SampleA | file_1.fastq.gz | file_2.fastq.gz | -A minimal [example samplesheet](../assets/samplesheet_minimal.csv) has been provided with the pipeline. +An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/). 
@@ -32,7 +32,7 @@ The sample sheet, when including the optional `sample_name` column, should look | ------- | ----------- | --------------- | --------------- | | SampleA | A1 | file_1.fastq.gz | file_2.fastq.gz | -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline, which includes the `sample_name` column. +An [example samplesheet](../tests/data/samplename_samplesheet.csv) has been provided with the pipeline, which includes the `sample_name` column. # Parameters diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 46a711c..9986717 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,4 @@ -sample,sample_name,fastq_1,fastq_2 -SAMPLE1,A1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz -SAMPLE2,B2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz -SAMPLE3,C3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, +sample,fastq_1,fastq_2 +SAMPLE1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz +SAMPLE2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz +SAMPLE3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, diff --git a/assets/samplesheet_minimal.csv b/assets/samplesheet_minimal.csv deleted file mode 100644 index 8820b5d..0000000 --- a/assets/samplesheet_minimal.csv +++ /dev/null @@ -1,5 +0,0 @@ -sample,fastq_1,fastq_2 
-SAMPLE1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz -SAMPLE2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz -SAMPLE3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, - diff --git a/docs/usage.md b/docs/usage.md index 4b4189b..8c4e97d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -26,7 +26,7 @@ SAMPLE2,sample2_R1.fastq.gz,sample2_R2.fastq.gz SAMPLE3,sample3_R1.fastq.gz, ``` -A minimal [example samplesheet](../assets/samplesheet_minimal.csv) has been provided with the pipeline. +A [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. ### IRIDA-Next Optional Samplesheet Configuration @@ -48,7 +48,7 @@ SAMPLE3,C3,sample3_R1.fastq.gz, | `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | | `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline, which includes the `sample_name` column. +An [example samplesheet](../tests/data/samplename_samplesheet.csv) has been provided with the pipeline, which includes the `sample_name` column. 
## Running the pipeline From f6e6b0f9fa964b49b810da3c34f3eef44c24c1a6 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 25 Sep 2024 17:24:26 -0400 Subject: [PATCH 15/15] Edits to README and usage docs --- README.md | 2 +- assets/samplesheet.csv | 8 ++++---- assets/samplesheet_minimal.csv | 4 ++++ docs/usage.md | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) create mode 100644 assets/samplesheet_minimal.csv diff --git a/README.md b/README.md index b780f93..07fb475 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The input to the pipeline is a standard sample sheet (passed as `--input samples | ------- | --------------- | --------------- | | SampleA | file_1.fastq.gz | file_2.fastq.gz | -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +An [example samplesheet](assets/samplesheet_minimal.csv) has been provided with the pipeline. The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/). 
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 9986717..46a711c 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,4 @@ -sample,fastq_1,fastq_2 -SAMPLE1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz -SAMPLE2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz -SAMPLE3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, +sample,sample_name,fastq_1,fastq_2 +SAMPLE1,A1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz +SAMPLE2,B2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz +SAMPLE3,C3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, diff --git a/assets/samplesheet_minimal.csv b/assets/samplesheet_minimal.csv new file mode 100644 index 0000000..9986717 --- /dev/null +++ b/assets/samplesheet_minimal.csv @@ -0,0 +1,4 @@ +sample,fastq_1,fastq_2 +SAMPLE1,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R2.fastq.gz +SAMPLE2,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_sample2_R2.fastq.gz +SAMPLE3,https://github.com/nf-core/test-datasets/raw/mag/test_data/test_minigut_R1.fastq.gz, diff --git a/docs/usage.md b/docs/usage.md index 8c4e97d..a728f3b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -26,7 +26,7 @@ 
SAMPLE2,sample2_R1.fastq.gz,sample2_R2.fastq.gz SAMPLE3,sample3_R1.fastq.gz, ``` -A [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +An [example samplesheet](../assets/samplesheet_minimal.csv) has been provided with the pipeline. ### IRIDA-Next Optional Samplesheet Configuration