From a2460149377d2020b05ff9136e8da3f06d0658ca Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 10 Oct 2024 14:47:16 -0400 Subject: [PATCH] Added irida-id to new_addresses output to be compatible with IRIDA-Next --- conf/iridanext.config | 10 ++++++--- docs/output.md | 2 +- modules/local/filter_query/main.nf | 9 +++++--- .../sample_name_add_iridanext.output.json | 22 +++++++++++++------ .../samplesheets/samplesheet-sample_name.csv | 1 + tests/pipelines/main.nf.test | 19 ++++++++-------- workflows/gas_nomenclature.nf | 14 +++++++++--- 7 files changed, 51 insertions(+), 26 deletions(-) diff --git a/conf/iridanext.config b/conf/iridanext.config index 63e181e..95a423e 100644 --- a/conf/iridanext.config +++ b/conf/iridanext.config @@ -4,14 +4,18 @@ iridanext { path = "${params.outdir}/iridanext.output.json.gz" overwrite = true files { + idkey = "irida_id" samples = ["**/input/*_error_report.csv"] } metadata { - idkey = "id_irida" samples { + keep = [ + "address" + ] csv { - path = "**/filter/new_addresses.csv" - idcol = "id" + path = "**/filter/new_addresses.tsv" + sep = "\t" + idcol = 'irida_id' } } } diff --git a/docs/output.md b/docs/output.md index 27a33c2..6d66634 100644 --- a/docs/output.md +++ b/docs/output.md @@ -93,7 +93,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d Output files - `filter/` - - `new_addresses.csv` + - `new_addresses.tsv` diff --git a/modules/local/filter_query/main.nf b/modules/local/filter_query/main.nf index fec0ea1..a196ffa 100644 --- a/modules/local/filter_query/main.nf +++ b/modules/local/filter_query/main.nf @@ -13,7 +13,7 @@ process FILTER_QUERY { val out_format output: - path("new_addresses.*"), emit: csv + path("new_addresses.*"), emit: tsv path("versions.yml"), emit: versions script: @@ -24,13 +24,16 @@ process FILTER_QUERY { """ # Filter the query samples only; keep only the 'id' and 'address' columns + csvtk cut -t -f 2 ${query_ids} > query_list.txt # Need to use the second column to pull meta.id because there is no header + csvtk add-header ${query_ids} -t -n irida_id,id > id.txt csvtk grep \\ ${addresses} \\ -f 1 \\ - -P ${query_ids} \\ + -P query_list.txt \\ --delimiter "${delimiter}" \\ --out-delimiter "${out_delimiter}" | \\ - csvtk cut -f id,address > ${outputFile}.${out_extension} + csvtk cut -t -f id,address > tmp.tsv + csvtk join -t -f id id.txt tmp.tsv > ${outputFile}.${out_extension} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/tests/data/irida/sample_name_add_iridanext.output.json b/tests/data/irida/sample_name_add_iridanext.output.json index acc2745..92fce05 100644 --- a/tests/data/irida/sample_name_add_iridanext.output.json +++ b/tests/data/irida/sample_name_add_iridanext.output.json @@ -1,30 +1,38 @@ { "files": { "global": [ - + ], "samples": { - "sample_1": [ + "sampleQ": [ { "path": "input/sample_1_error_report.csv" } ], - "sample_2_sample2": [ + "sample1": [ + { + "path": "input/sample_2_error_report.csv" + } + ], + "sample2": [ { "path": "input/sample_2_sample2_error_report.csv" } ], - "sample_2": [ + "sampleR": [ { - "path": "input/sample_2_error_report.csv" + "path": "input/sample4_error_report.csv" } ] } }, "metadata": { "samples": { - "sample_1": { - "address": "1.1.3" + "sampleQ": { + "address": "2.2.3" + }, + "sampleR": { + "address": "2.2.3" } } } diff --git a/tests/data/samplesheets/samplesheet-sample_name.csv b/tests/data/samplesheets/samplesheet-sample_name.csv index c6c73d5..fb51952 100644 --- a/tests/data/samplesheets/samplesheet-sample_name.csv +++ b/tests/data/samplesheets/samplesheet-sample_name.csv @@ -3,3 +3,4 @@ sampleQ,sample 1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/ sample1,sample#2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 sample2,sample#2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 sample3,,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 +sampleR,sample4,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 5352627..94e8090 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -221,9 +221,9 @@ nextflow_pipeline { assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") // Check filter_query csv file - lines = path("$launchDir/results/filter/new_addresses.csv").readLines() - assert lines.contains("sampleQ,2.2.3") - assert lines.contains("sampleR,2.2.3") + lines = path("$launchDir/results/filter/new_addresses.tsv").readLines() + assert lines.contains("sampleQ\tsampleQ\t2.2.3") + assert lines.contains("sampleR\tsampleR\t2.2.3") // Check IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/mismatched_iridanext.output.json").json @@ -271,8 +271,8 @@ nextflow_pipeline { assert lines.contains('sample3,"[\'extra_key\', \'sample3\']","MLST JSON file (sample3_multiplekeys.mlst.json) contains multiple keys: [\'extra_key\', \'sample3\']. The MLST JSON file has been modified to retain only the \'sample3\' entry"') // Check filtered query csv results - lines = path("$launchDir/results/filter/new_addresses.csv").readLines() - assert lines.contains("sampleQ,1.1.3") + lines = path("$launchDir/results/filter/new_addresses.tsv").readLines() + assert lines.contains("sampleQ\tsampleQ\t1.1.3") // Check IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json @@ -320,8 +320,8 @@ nextflow_pipeline { assert lines.contains('sample3,"[\'extra_key\', \'sample4\']",No key in the MLST JSON file (sample3_multiplekeys_nomatch.mlst.json) matches the specified sample ID \'sample3\'. The first key \'extra_key\' has been forcefully changed to \'sample3\' and all other keys have been removed.') // Check filtered query csv results - lines = path("$launchDir/results/filter/new_addresses.csv").readLines() - assert lines.contains("sampleQ,1.1.3") + lines = path("$launchDir/results/filter/new_addresses.tsv").readLines() + assert lines.contains("sampleQ\tsampleQ\t1.1.3") // Check IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json @@ -385,8 +385,9 @@ nextflow_pipeline { assert lines.contains("sample_2_sample2,[\'sample2\'],Reference sample_2_sample2 ID and JSON key in sample2.mlst.json DO NOT MATCH. The 'sample2' key in sample2.mlst.json has been forcefully changed to 'sample_2_sample2': User should manually check input files to ensure correctness.") // Check filter_query csv file - lines = path("$launchDir/results/filter/new_addresses.csv").readLines() - assert lines.contains("sample_1,1.1.3") + lines = path("$launchDir/results/filter/new_addresses.tsv").readLines() + assert lines.contains("sampleQ\tsample_1\t2.2.3") + assert lines.contains("sampleR\tsample4\t2.2.3") // Check IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/sample_name_add_iridanext.output.json").json diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 4101a69..6899ec7 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -106,6 +106,11 @@ workflow GAS_NOMENCLATURE { reference_values = input_assure.result.collect{ meta, mlst -> mlst} query_values = profiles.query.collect{ meta, mlst -> mlst } + // Query Map: Use to return meta.irida_id to output for mapping to IRIDA-Next JSON + query_map = profiles.query.map{ meta, mlst-> + tuple(meta.id, meta.irida_id) + }.collect() + // LOCIDEX modules ref_tag = Channel.value("ref") query_tag = Channel.value("value") @@ -166,16 +171,19 @@ workflow GAS_NOMENCLATURE { called_data = GAS_CALL(expected_clusters.text, distances.results) ch_versions = ch_versions.mix(called_data.versions) - // Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in - query_ids = profiles.query.collectFile { it[0].id + '\n' } + // Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in and + // add a column with IRIDA ID to allow for IRIDANext plugin to include metadata + query_irida_ids = profiles.query.collectFile { it[0].irida_id + '\t' + it[0].id + '\n'} - new_addresses = FILTER_QUERY(query_ids, called_data.distances, "tsv", "csv") + new_addresses = FILTER_QUERY(query_irida_ids, called_data.distances, "tsv", "tsv") ch_versions = ch_versions.mix(new_addresses.versions) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) + + } /*