Merge pull request #24 from phac-nml/dev

0.3.0 Release
phac-nml · Oct 21, 2024 · 6c90cff · 6c90cff
2 parents 16aebef + a167c35
commit 6c90cff
Show file tree

Hide file tree

Showing 27 changed files with 312 additions and 86 deletions.
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -42,17 +42,32 @@ jobs:
           python-version: "3.11"
           architecture: "x64"
 
+      - name: read .nf-core.yml
+        uses: pietrobolcato/action-read-yaml@1.1.0
+        id: read_yml
+        with:
+          config: ${{ github.workspace }}/.nf-core.yml
+
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install nf-core
+          pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }}
+
+      - name: Run nf-core pipelines lint
+        if: ${{ github.base_ref != 'main' }}
+        env:
+          GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
+        run: nf-core -l lint_log.txt pipelines lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
 
-      - name: Run nf-core lint
+      - name: Run nf-core pipelines lint --release
+        if: ${{ github.base_ref == 'master' }}
         env:
           GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
-        run: nf-core -l lint_log.txt lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
+        run: nf-core -l lint_log.txt pipelines lint --release --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
 
       - name: Save PR number
         if: ${{ always() }}

diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download lint results
-        uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3
+        uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # v6
         with:
           workflow: linting.yml
           workflow_conclusion: completed

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1,4 +1,5 @@
 repository_type: pipeline
+nf_core_version: "3.0.1"
 
 lint:
   files_exist:
@@ -31,5 +32,8 @@ lint:
     - custom_config
     - manifest.name
     - manifest.homePage
+    - params.max_cpus
+    - params.max_memory
+    - params.max_time
   readme:
     - nextflow_badge
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,16 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.0] - 2024-10-21
+
+- Added the ability to include a `sample_name` column in the input samplesheet.csv. Allows for compatibility with IRIDA-Next input configuration.
+
+  - `sample_name` special characters will be replaced with `"_"`
+  - If no `sample_name` is supplied in the column, `sample` will be used
+  - To avoid repeat values for `sample_name` all `sample_name` values will be suffixed with the unique `sample` value from the input file
+
+- Fixed linting issues in CI caused by nf-core 3.0.1
+
 ## [0.2.0] - 2024-09-05
 
 ### Changed
@@ -25,3 +35,4 @@ Initial release of the arboratornf pipeline to be used for running [Arborator](h
 
 [0.1.0]: https://github.com/phac-nml/arboratornf/releases/tag/0.1.0
 [0.2.0]: https://github.com/phac-nml/arboratornf/releases/tag/0.2.0
+[0.3.0]: https://github.com/phac-nml/arboratornf/releases/tag/0.3.0
diff --git a/README.md b/README.md
@@ -32,6 +32,16 @@ An example of the sample sheet is available in [tests/data/samplesheets/samplesh
 
 Furthermore, the structure of the sample sheet is programmatically defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/).
 
+## IRIDA-Next Optional Input Configuration
+
+`arboratornf` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which can contain an additional column: `sample_name`
+
+`sample_name`: An **optional** column that overrides `sample` for outputs (filenames and sample names) and reference assembly identification.
+
+`sample_name` allows more flexibility in naming output files or sample identification. Unlike `sample`, `sample_name` is not required to contain unique values. `Nextflow` requires unique sample names, and therefore in the instance of repeated `sample_name` values, `sample` will be suffixed to any `sample_name`. Non-alphanumeric characters (excluding `_`, `-`, `.`) will be replaced with `"_"`.
+
+An [example samplesheet](tests/data/samplesheets/samplesheet-samplename.csv) has been provided with the pipeline.
+
 # Parameters
 
 The mandatory parameters are `--input`, which specifies the samplesheet as described above, and `--output`, which specifies the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run. Metadata-related parameters are described above in [Input](#input).

diff --git a/assets/config_lookup.json b/assets/config_lookup.json
@@ -2,7 +2,7 @@
     "outlier_thresh": "25",
     "min_cluster_members": 2,
     "partition_column_name": "outbreak",
-    "id_column_name": "sample",
+    "id_column_name": "sample_name",
     "only_report_labeled_columns": "False",
     "skip_qa": "False",
 
@@ -62,6 +62,7 @@
         "organism": { "data_type": "None", "label": "Organism", "default": "", "display": "True" },
         "outbreak": { "data_type": "None", "label": "Outbreak Code", "default": "", "display": "True" },
         "sample": { "data_type": "None", "label": "Sample", "default": "", "display": "True" },
+        "sample_name": { "data_type": "None", "label": "Sample", "default": "", "display": "True" },
         "serovar": { "data_type": "Categorical", "label": "Serovar", "default": "", "display": "True" },
         "special": { "data_type": "Categorical", "label": "Special", "default": "", "display": "True" },
         "source": { "data_type": "Categorical", "label": "Source Type", "default": "", "display": "True" },

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft-07/schema",
     "$id": "https://raw.githubusercontent.com/phac-nml/arboratornf/main/assets/schema_input.json",
     "title": "phac-nml/arboratornf pipeline - params.input schema",
     "description": "Schema for the file provided with params.input",
@@ -10,10 +10,15 @@
             "sample": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "meta": ["id"],
+                "meta": ["irida_id"],
                 "unique": true,
                 "errorMessage": "Sample name must be provided and cannot contain spaces."
             },
+            "sample_name": {
+                "type": "string",
+                "meta": ["id"],
+                "errorMessage": "Sample name is optional, if provided will replace sample for filenames and outputs"
+            },
             "mlst_alleles": {
                 "type": "string",
                 "format": "file-path",

diff --git a/conf/iridanext.config b/conf/iridanext.config
@@ -5,7 +5,7 @@ iridanext {
         overwrite = true
         validate = true
         files {
-            idkey = "id"
+            idkey = "irida_id"
             global = [
                 "**/arborator/cluster_summary.tsv",
                 "**/arborator/metadata.included.tsv",
@@ -22,4 +22,4 @@ iridanext {
             ]
         }
     }
-}
+}
diff --git a/docs/usage.md b/docs/usage.md
@@ -12,7 +12,7 @@ You will need to create a samplesheet with information about the samples you wou
 --input '[path to samplesheet file]'
 ```
 
-### Full samplesheet
+### Full Standard Samplesheet
 
 The input samplesheet must contain the following columns: `sample`, `mlst_alleles`, `metadata_partition`, and `metadata_1` through `metadata_8`. The IDs (sample column) within a samplesheet should be unique and contain no spaces. Any other additionally specified trailing columns will be ignored.
 
@@ -37,6 +37,29 @@ S6,S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
+### IRIDA-Next Optional Samplesheet Configuration
+
+`arboratornf` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which contain the following columns: `sample`, `sample_name`, `fastq_1`, `fastq_2`, `reference_assembly`, and `metadata_1` - `metadata_8`. The sample IDs within a samplesheet should be unique.
+
+A final samplesheet file consisting of both single- and paired-end data may look something like the one below.
+
+```console
+sample,sample_name,fastq_1,fastq_2,reference_assembly,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+SAMPLE1,A1,/path/to/sample1_fastq1.fq,/path/to/sample1_fastq2.fq,/path/to/sample1_assembly.fa,,,,,,,,
+SAMPLE2,B2,/path/to/sample2_fastq1.fq,,,,,,,,,,
+```
+
+| Column                       | Description                                                                                                                                                                            |
+| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`                     | Custom sample name. Samples should be unique within a samplesheet.                                                                                                                     |
+| `sample_name`                | Sample name used in outputs (filenames and sample names)                                                                                                                               |
+| `fastq_1`                    | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
+| `fastq_2`                    | (Optional) Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                  |
+| `reference_assembly`         | (Optional) Full path to a FASTA file representing a reference assembly derived from this sample. This field provides a method for selecting a reference genome for the whole pipeline. |
+| `metadata_1` to `metadata_8` | (Optional) Permits up to 8 columns for user-defined contextual metadata associated with each `sample`.                                                                                 |
+
+An [example samplesheet](../tests/data/samplesheets/samplesheet-samplename.csv) has been provided with the pipeline.
+
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:

diff --git a/modules/local/buildconfig/main.nf b/modules/local/buildconfig/main.nf
@@ -31,7 +31,7 @@ process BUILD_CONFIG {
     def json_linelist = [:]
 
     def id = metadata_headers[0]
-    def PARTITION_INDEX = 1
+    def PARTITION_INDEX = 2
     def partition = metadata_headers[PARTITION_INDEX]
 
     // GENERAL

diff --git a/nextflow.config b/nextflow.config
@@ -90,7 +90,6 @@ profiles {
     }
     docker {
         docker.enabled         = true
-        docker.userEmulation   = true
         conda.enabled          = false
         singularity.enabled    = false
         podman.enabled         = false
@@ -167,6 +166,7 @@ singularity.registry = 'quay.io'
 
 // Override the default Docker registry when required
 process.ext.override_configured_container_registry = true
+process.containerOptions = '-u $(id -u):$(id -g)'
 
 // Nextflow plugins
 plugins {
@@ -213,7 +213,7 @@ manifest {
     description     = """Arborator: Genomic Profile Clustering and Summary"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '0.2.0'
+    version         = '0.3.0'
     doi             = ''
     defaultBranch   = 'main'
 }

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft-07/schema",
     "$id": "https://raw.githubusercontent.com/phac-nml/arboratornf/main/nextflow_schema.json",
     "title": "phac-nml/arboratornf pipeline parameters",
     "description": "IRIDA Next Example Pipeline",

diff --git a/tests/data/arborator/basic/metadata.included.tsv b/tests/data/arborator/basic/metadata.included.tsv
@@ -1,7 +1,7 @@
-sample	outbreak	organism	subtype	country	serovar	age	date	source	special
-S1	1	Escherichia coli	EHEC/STEC	Canada	O157:H7	21	2024/05/30	beef	True
-S2	1	Escherichia coli	EHEC/STEC	The United States	O157:H7	55	2024/05/21	milk	False
-S3	2	Escherichia coli	EPEC	France	O125	14	2024/04/30	cheese	True
-S4	2	Escherichia coli	EPEC	France	O125	35	2024/04/22	cheese	True
-S5	3	Escherichia coli	EAEC	Canada	O126:H27	61	2012/09/01	milk	False
-S6	unassociated	Escherichia coli	EAEC	Canada	O111:H21	43	2011/12/25	fruit	False
+sample_name	outbreak	organism	subtype	country	serovar	age	date	source	special	sample
+S1	1	Escherichia coli	EHEC/STEC	Canada	O157:H7	21	2024/05/30	beef	True	S1
+S2	1	Escherichia coli	EHEC/STEC	The United States	O157:H7	55	2024/05/21	milk	False	S2
+S3	2	Escherichia coli	EPEC	France	O125	14	2024/04/30	cheese	True	S3
+S4	2	Escherichia coli	EPEC	France	O125	35	2024/04/22	cheese	True	S4
+S5	3	Escherichia coli	EAEC	Canada	O126:H27	61	2012/09/01	milk	False	S5
+S6	unassociated	Escherichia coli	EAEC	Canada	O111:H21	43	2011/12/25	fruit	False	S6
diff --git a/tests/data/arborator/little_metadata/metadata.included.tsv b/tests/data/arborator/little_metadata/metadata.included.tsv
@@ -1,7 +1,7 @@
-sample	outbreak	organism	subtype
-S1	1	Escherichia coli	EHEC/STEC
-S2	1	Escherichia coli	EHEC/STEC
-S3	2	Escherichia coli	EPEC
-S4	2	Escherichia coli	EPEC
-S5	3	Escherichia coli	EAEC
-S6	unassociated	Escherichia coli	EAEC
+sample_name	outbreak	organism	subtype	sample
+S1	1	Escherichia coli	EHEC/STEC	S1
+S2	1	Escherichia coli	EHEC/STEC	S2
+S3	2	Escherichia coli	EPEC	S3
+S4	2	Escherichia coli	EPEC	S4
+S5	3	Escherichia coli	EAEC	S5
+S6	unassociated	Escherichia coli	EAEC	S6
diff --git a/tests/data/arborator/mismatch/metadata.included.tsv b/tests/data/arborator/mismatch/metadata.included.tsv
@@ -1,7 +1,7 @@
-sample	outbreak	organism	subtype	country	serovar	age	date	source	special
-S1	1	Escherichia coli	EHEC/STEC	Canada	O157:H7	21	2024/05/30	beef	True
-MISMATCH	1	Escherichia coli	EHEC/STEC	The United States	O157:H7	55	2024/05/21	milk	False
-S3	2	Escherichia coli	EPEC	France	O125	14	2024/04/30	cheese	True
-S4	2	Escherichia coli	EPEC	France	O125	35	2024/04/22	cheese	True
-S5	3	Escherichia coli	EAEC	Canada	O126:H27	61	2012/09/01	milk	False
-S6	unassociated	Escherichia coli	EAEC	Canada	O111:H21	43	2011/12/25	fruit	False
+sample_name	outbreak	organism	subtype	country	serovar	age	date	source	special	sample
+S1	1	Escherichia coli	EHEC/STEC	Canada	O157:H7	21	2024/05/30	beef	True	S1
+MISMATCH	1	Escherichia coli	EHEC/STEC	The United States	O157:H7	55	2024/05/21	milk	False	MISMATCH
+S3	2	Escherichia coli	EPEC	France	O125	14	2024/04/30	cheese	True	S3
+S4	2	Escherichia coli	EPEC	France	O125	35	2024/04/22	cheese	True	S4
+S5	3	Escherichia coli	EAEC	Canada	O126:H27	61	2012/09/01	milk	False	S5
+S6	unassociated	Escherichia coli	EAEC	Canada	O111:H21	43	2011/12/25	fruit	False	S6
diff --git a/tests/data/arborator/samplenames_metadata/metadata.included.tsv b/tests/data/arborator/samplenames_metadata/metadata.included.tsv
@@ -0,0 +1,7 @@
+sample_name	outbreak	organism	subtype	country	serovar	age	date	source	special	sample
+sample1	1	Escherichia coli	EHEC/STEC	Canada	O157:H7	21	2024/05/30	beef	True	S1
+sample_2	1	Escherichia coli	EHEC/STEC	The United States	O157:H7	55	2024/05/21	milk	False	S2
+sample_3	2	Escherichia coli	EPEC	France	O125	14	2024/04/30	cheese	True	S3
+sample4	2	Escherichia coli	EPEC	France	O125	35	2024/04/22	cheese	True	S4
+sample4_S5	3	Escherichia coli	EAEC	Canada	O126:H27	61	2012/09/01	milk	False	S5
+S6	unassociated	Escherichia coli	EAEC	Canada	O111:H21	43	2011/12/25	fruit	False	S6
diff --git a/tests/data/configs/autoconfig_little-metadata.json b/tests/data/configs/autoconfig_little-metadata.json
@@ -2,7 +2,7 @@
     "outlier_thresh": "25",
     "min_cluster_members": 2,
     "partition_column_name": "outbreak",
-    "id_column_name": "sample",
+    "id_column_name": "sample_name",
     "only_report_labeled_columns": "False",
     "skip_qa": "False",
     "grouped_metadata_columns": {
@@ -26,7 +26,7 @@
         }
     },
     "linelist_columns": {
-        "sample": {
+        "sample_name": {
             "data_type": "None",
             "label": "Sample",
             "default": "",

diff --git a/tests/data/configs/autoconfig_samplesheet.json b/tests/data/configs/autoconfig_samplesheet.json
@@ -2,7 +2,7 @@
     "outlier_thresh": "25",
     "min_cluster_members": 2,
     "partition_column_name": "outbreak",
-    "id_column_name": "sample",
+    "id_column_name": "sample_name",
     "only_report_labeled_columns": "False",
     "skip_qa": "False",
     "grouped_metadata_columns": {
@@ -62,7 +62,7 @@
         }
     },
     "linelist_columns": {
-        "sample": {
+        "sample_name": {
             "data_type": "None",
             "label": "Sample",
             "default": "",

diff --git a/tests/data/configs/config.json b/tests/data/configs/config.json
@@ -2,11 +2,11 @@
         "outlier_thresh": "25",
         "min_cluster_members": 2,
         "partition_column_name": "outbreak",
-        "id_column_name": "sample_id",
+        "id_column_name": "sample_name",
         "only_report_labeled_columns": "False",
         "skip_qa": "False",
-        
-        "grouped_metadata_columns":{ 
+
+        "grouped_metadata_columns":{
             "outbreak":{"data_type": "None","label":"National Outbreak Code","default":"","display":"True"},
             "organism":{"data_type": "None","label":"Organism","default":"","display":"True"},
             "subtype":{"data_type": "None","label":"Subtype","default":"","display":"True"},
@@ -19,7 +19,7 @@
         },
 
         "linelist_columns":{
-            "sample":{"data_type": "None","label":"Sample","default":"","display":"True"},
+            "sample_name":{"data_type": "None","label":"Sample","default":"","display":"True"},
             "outbreak":{"data_type": "None","label":"National Outbreak Code","default":"","display":"True"},
             "organism":{"data_type": "None","label":"Organism","default":"","display":"True"},
             "subtype":{"data_type": "None","label":"Subtype","default":"","display":"True"},
@@ -29,5 +29,5 @@
             "date":{"data_type": "min_max","label":"Date","default":"","display":"True"},
             "source":{"data_type": "categorical","label":"Source Type","default":"","display":"True"},
             "special":{"data_type": "categorical","label":"Special","default":"","display":"True"}
-        }    
+        }
     }
diff --git a/tests/data/metadata/expected_merged_data.tsv b/tests/data/metadata/expected_merged_data.tsv
@@ -1,7 +1,7 @@
-sample	outbreak	organism	subtype	country	serovar	age	date	source	special
-S1	1	Escherichia coli	EHEC/STEC	Canada	O157:H7	21	2024/05/30	beef	true
-S2	1	Escherichia coli	EHEC/STEC	The United States	O157:H7	55	2024/05/21	milk	false
-S3	2	Escherichia coli	EPEC	France	O125	14	2024/04/30	cheese	true
-S4	2	Escherichia coli	EPEC	France	O125	35	2024/04/22	cheese	true
-S5	3	Escherichia coli	EAEC	Canada	O126:H27	61	2012/09/01	milk	false
-S6	unassociated	Escherichia coli	EAEC	Canada	O111:H21	43	2011/12/25	fruit	false
+sample_name	sample	outbreak	organism	subtype	country	serovar	age	date	source	special
+S1	S1	1	Escherichia coli	EHEC/STEC	Canada	O157:H7	21	2024/05/30	beef	true
+S2	S2	1	Escherichia coli	EHEC/STEC	The United States	O157:H7	55	2024/05/21	milk	false
+S3	S3	2	Escherichia coli	EPEC	France	O125	14	2024/04/30	cheese	true
+S4	S4	2	Escherichia coli	EPEC	France	O125	35	2024/04/22	cheese	true
+S5	S5	3	Escherichia coli	EAEC	Canada	O126:H27	61	2012/09/01	milk	false
+S6	S6	unassociated	Escherichia coli	EAEC	Canada	O111:H21	43	2011/12/25	fruit	false
diff --git a/tests/data/metadata/little-metadata-merged.tsv b/tests/data/metadata/little-metadata-merged.tsv
@@ -1,7 +1,7 @@
-sample	outbreak	organism	subtype
-S1	1	Escherichia coli	EHEC/STEC
-S2	1	Escherichia coli	EHEC/STEC
-S3	2	Escherichia coli	EPEC
-S4	2	Escherichia coli	EPEC
-S5	3	Escherichia coli	EAEC
-S6	unassociated	Escherichia coli	EAEC
+sample_name	sample	outbreak	organism	subtype
+S1	S1	1	Escherichia coli	EHEC/STEC
+S2	S2	1	Escherichia coli	EHEC/STEC
+S3	S3	2	Escherichia coli	EPEC
+S4	S4	2	Escherichia coli	EPEC
+S5	S5	3	Escherichia coli	EAEC
+S6	S6	unassociated	Escherichia coli	EAEC