phac-nml · kylacochrane · Sep 5, 2024 · Aug 22, 2024 · Aug 22, 2024 · Aug 22, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,15 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.2.0] - 2024-09-05
+
+### Changed
+
+- Upgraded `locidex/merge` to version `0.2.3`; updated `input_assure.py` and the test data for compatibility with the new `mlst.json` allele file format.
+  - [PR19](https://github.com/phac-nml/arboratornf/pull/19)
+
+This pipeline is now compatible only with output generated by [Locidex v0.2.3+](https://github.com/phac-nml/locidex) and [Mikrokondo v0.4.0+](https://github.com/phac-nml/mikrokondo/releases/tag/v0.4.0).
+
 ## [0.1.0] - 2024-08-20
 
 Initial release of the arboratornf pipeline to be used for running [Arborator](https://github.com/phac-nml/arborator) under Nextflow.
@@ -15,3 +24,4 @@ Initial release of the arboratornf pipeline to be used for running [Arborator](h
 - ArborView integration.
 
 [0.1.0]: https://github.com/phac-nml/arboratornf/releases/tag/0.1.0
+[0.2.0]: https://github.com/phac-nml/arboratornf/releases/tag/0.2.0
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,7 +1,7 @@
 sample,mlst_alleles,metadata_partition,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-S1,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S1.mlst.json,1,"Escherichia coli","EHEC/STEC","Canada","O157:H7",21,"2024/05/30","beef",true
-S2,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S2.mlst.json,1,"Escherichia coli","EHEC/STEC","The United States","O157:H7",55,"2024/05/21","milk",false
-S3,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S3.mlst.json,2,"Escherichia coli","EPEC","France","O125",14,"2024/04/30","cheese",true
-S4,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S4.mlst.json,2,"Escherichia coli","EPEC","France","O125",35,"2024/04/22","cheese",true
-S5,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S5.mlst.json,3,"Escherichia coli","EAEC","Canada","O126:H27",61,"2012/09/01","milk",false
-S6,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2011/12/25","fruit",false
+S1,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S1.mlst.json,1,"Escherichia coli","EHEC/STEC","Canada","O157:H7",21,"2024/05/30","beef",true
+S2,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S2.mlst.json,1,"Escherichia coli","EHEC/STEC","The United States","O157:H7",55,"2024/05/21","milk",false
+S3,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S3.mlst.json,2,"Escherichia coli","EPEC","France","O125",14,"2024/04/30","cheese",true
+S4,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S4.mlst.json,2,"Escherichia coli","EPEC","France","O125",35,"2024/04/22","cheese",true
+S5,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S5.mlst.json,3,"Escherichia coli","EAEC","Canada","O126:H27",61,"2012/09/01","milk",false
+S6,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2011/12/25","fruit",false
diff --git a/bin/input_assure.py b/bin/input_assure.py
@@ -19,38 +19,41 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f
     with open_file(json_file, "rt") as f:
         json_data = json.load(f)
 
+    # Extract the profile from the json_data
+    profile = json_data.get("data", {}).get("profile", {})
+    # Collect the profile's keys in sorted order; the first key is the one reported/renamed below on a mismatch
+    keys = sorted(profile.keys())
+    original_key = keys[0] if keys else None
+
     # Define a variable to store the match_status (True or False)
-    match_status = sample_id in json_data
+    match_status = sample_id in profile
 
     # Initialize the error message
     error_message = None
 
-    # Check for multiple keys in the JSON file and define error message
-    keys = list(json_data.keys())
-    original_key = keys[0] if keys else None
-
-    if len(keys) == 0:
-        error_message = f"{json_file} is completely empty!"
+    if not keys:
+        error_message = f"{json_file} is missing the 'profile' section or is completely empty!"
         print(error_message)
         sys.exit(1)
     elif len(keys) > 1:
         # Check if sample_id matches any key
         if not match_status:
             error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed."
             # Retain only the specified sample ID
-            json_data = {sample_id: json_data.pop(original_key)}
+            json_data["data"]["profile"] = {sample_id: profile.pop(original_key)}
         else:
             error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry"
-            # Remove all keys expect the one matching sample_id
-            json_data = {sample_id: json_data[sample_id]}
+            # Retain only the specified sample_id in the profile
+            json_data["data"]["profile"] = {sample_id: profile[sample_id]}
     elif not match_status:
         # Define error message based on meta.address (query or reference)
         if address == "null":
             error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness."
         else:
             error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness."
         # Update the JSON file with the new sample ID
-        json_data[sample_id] = json_data.pop(original_key)
+        json_data["data"]["profile"] = {sample_id: profile.pop(original_key)}
+        json_data["data"]["sample_name"] = sample_id
 
     # Write file containing relevant error messages
     if error_message:
@@ -69,21 +72,11 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f
         description="Check sample inputs, force change if ID ≠ KEY, and generate an error report."
     )
     parser.add_argument("--input", help="Path to the mlst.json file.", required=True)
-    parser.add_argument(
-        "--sample_id", help="Sample ID to check in the JSON file.", required=True
-    )
-    parser.add_argument(
-        "--address", help="Address to use in the error message.", required=True
-    )
-    parser.add_argument(
-        "--output_error", help="Path to the error report file.", required=True
-    )
-    parser.add_argument(
-        "--output_json", help="Path to the MLST JSON file (gzipped).", required=True
-    )
+    parser.add_argument("--sample_id", help="Sample ID to check in the JSON file.", required=True)
+    parser.add_argument("--address", help="Address to use in the error message.", required=True)
+    parser.add_argument("--output_error", help="Path to the error report file.", required=True)
+    parser.add_argument("--output_json", help="Path to the MLST JSON file (gzipped).", required=True)
 
     args = parser.parse_args()
 
-    check_inputs(
-        args.input, args.sample_id, args.address, args.output_error, args.output_json
-    )
+    check_inputs(args.input, args.sample_id, args.address, args.output_error, args.output_json)
diff --git a/conf/test.config b/conf/test.config
@@ -20,7 +20,7 @@ params {
     max_time   = '1.h'
 
     // Input data
-    input  = 'https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/samplesheets/samplesheet.csv'
+    input  = "${projectDir}/tests/data/samplesheets/samplesheet.csv"
 
     outdir = "results"
 

diff --git a/modules/local/locidex/merge/main.nf b/modules/local/locidex/merge/main.nf
@@ -7,8 +7,9 @@ process LOCIDEX_MERGE {
     label 'process_medium'
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-    'https://depot.galaxyproject.org/singularity/locidex:0.1.1--pyhdfd78af_0' :
-    'biocontainers/locidex:0.1.1--pyhdfd78af_0' }"
+    "docker.io/mwells14/locidex:0.2.3" :
+    task.ext.override_configured_container_registry != false ? 'docker.io/mwells14/locidex:0.2.3' :
+    'mwells14/locidex:0.2.3' }" 
 
     input:
     path input_values // [file(sample1), file(sample2), file(sample3), etc...]

diff --git a/nextflow.config b/nextflow.config
@@ -213,7 +213,7 @@ manifest {
     description     = """Arborator: Genomic Profile Clustering and Summary"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '0.1.0'
+    version         = '0.2.0'
     doi             = ''
     defaultBranch   = 'main'
 }

diff --git a/tests/data/profiles/S1.mlst.json b/tests/data/profiles/S1.mlst.json
@@ -1,11 +1,25 @@
 {
-  "S1": {
-    "locus_1": 1,
-    "locus_2": 1,
-    "locus_3": "1",
-    "locus_4": "1",
-    "locus_5": "1",
-    "locus_6": 1,
-    "locus_7": 1
+  "db_info": {},
+  "parameters": {
+    "mode": "normal",
+    "min_match_ident": 100,
+    "min_match_cov": 100,
+    "max_ambiguous": 0,
+    "max_internal_stops": 0
+  },
+  "data": {
+    "sample_name": "S1",
+    "profile": {
+      "S1": {
+        "locus_1": 1,
+        "locus_2": 1,
+        "locus_3": "1",
+        "locus_4": "1",
+        "locus_5": "1",
+        "locus_6": 1,
+        "locus_7": 1
+      }
+    },
+    "seq_data": {}
   }
-}
+}
diff --git a/tests/data/profiles/S2.mlst.json b/tests/data/profiles/S2.mlst.json
@@ -1,11 +1,25 @@
 {
-  "S2": {
-    "locus_1": 1,
-    "locus_2": 1,
-    "locus_3": "2",
-    "locus_4": "2",
-    "locus_5": "?",
-    "locus_6": 4,
-    "locus_7": 1
+  "db_info": {},
+  "parameters": {
+    "mode": "normal",
+    "min_match_ident": 100,
+    "min_match_cov": 100,
+    "max_ambiguous": 0,
+    "max_internal_stops": 0
+  },
+  "data": {
+    "sample_name": "S2",
+    "profile": {
+      "S2": {
+        "locus_1": 1,
+        "locus_2": 1,
+        "locus_3": "2",
+        "locus_4": "2",
+        "locus_5": "?",
+        "locus_6": 4,
+        "locus_7": 1
+      }
+    },
+    "seq_data": {}
   }
-}
+}
diff --git a/tests/data/profiles/S3.mlst.json b/tests/data/profiles/S3.mlst.json
@@ -1,11 +1,25 @@
 {
-  "S3": {
-    "locus_1": 1,
-    "locus_2": 2,
-    "locus_3": "2",
-    "locus_4": "2",
-    "locus_5": "1",
-    "locus_6": 5,
-    "locus_7": 1
+  "db_info": {},
+  "parameters": {
+    "mode": "normal",
+    "min_match_ident": 100,
+    "min_match_cov": 100,
+    "max_ambiguous": 0,
+    "max_internal_stops": 0
+  },
+  "data": {
+    "sample_name": "S3",
+    "profile": {
+      "S3": {
+        "locus_1": 1,
+        "locus_2": 2,
+        "locus_3": "2",
+        "locus_4": "2",
+        "locus_5": "1",
+        "locus_6": 5,
+        "locus_7": 1
+      }
+    },
+    "seq_data": {}
   }
-}
+}
diff --git a/tests/data/profiles/S4.mlst.json b/tests/data/profiles/S4.mlst.json
@@ -1,11 +1,25 @@
 {
-  "S4": {
-    "locus_1": 1,
-    "locus_2": 2,
-    "locus_3": "3",
-    "locus_4": "2",
-    "locus_5": "1",
-    "locus_6": 6,
-    "locus_7": 1
+  "db_info": {},
+  "parameters": {
+    "mode": "normal",
+    "min_match_ident": 100,
+    "min_match_cov": 100,
+    "max_ambiguous": 0,
+    "max_internal_stops": 0
+  },
+  "data": {
+    "sample_name": "S4",
+    "profile": {
+      "S4": {
+        "locus_1": 1,
+        "locus_2": 2,
+        "locus_3": "3",
+        "locus_4": "2",
+        "locus_5": "1",
+        "locus_6": 6,
+        "locus_7": 1
+      }
+    },
+    "seq_data": {}
   }
-}
+}
diff --git a/tests/data/profiles/S5.mlst.json b/tests/data/profiles/S5.mlst.json
@@ -1,11 +1,25 @@
 {
-  "S5": {
-    "locus_1": 1,
-    "locus_2": 2,
-    "locus_3": "?",
-    "locus_4": "2",
-    "locus_5": "1",
-    "locus_6": 8,
-    "locus_7": 1
+  "db_info": {},
+  "parameters": {
+    "mode": "normal",
+    "min_match_ident": 100,
+    "min_match_cov": 100,
+    "max_ambiguous": 0,
+    "max_internal_stops": 0
+  },
+  "data": {
+    "sample_name": "S5",
+    "profile": {
+      "S5": {
+        "locus_1": 1,
+        "locus_2": 2,
+        "locus_3": "?",
+        "locus_4": "2",
+        "locus_5": "1",
+        "locus_6": 8,
+        "locus_7": 1
+      }
+    },
+    "seq_data": {}
   }
-}
+}
diff --git a/tests/data/profiles/S6.mlst.json b/tests/data/profiles/S6.mlst.json
@@ -1,11 +1,25 @@
 {
-  "S6": {
-    "locus_1": 2,
-    "locus_2": 3,
-    "locus_3": "3",
-    "locus_4": "-",
-    "locus_5": "?",
-    "locus_6": 9,
-    "locus_7": 0
+  "db_info": {},
+  "parameters": {
+    "mode": "normal",
+    "min_match_ident": 100,
+    "min_match_cov": 100,
+    "max_ambiguous": 0,
+    "max_internal_stops": 0
+  },
+  "data": {
+    "sample_name": "S6",
+    "profile": {
+      "S6": {
+        "locus_1": 2,
+        "locus_2": 3,
+        "locus_3": "3",
+        "locus_4": "-",
+        "locus_5": "?",
+        "locus_6": 9,
+        "locus_7": 0
+      }
+    },
+    "seq_data": {}
   }
-}
+}