
Commit

Merge branch 'nf-test_assertions' into nf-test_assertions_new
tillenglert committed Aug 5, 2024
2 parents 76ce8da + 3487024 commit 56fdc11
Showing 20 changed files with 60 additions and 50 deletions.
2 changes: 1 addition & 1 deletion bin/check_samplesheet_create_tables.py
@@ -242,7 +242,7 @@ def process_samplesheet(args):
+ "'nextflow run metapep -profile <YOURPROFILE> --outdir <OUTDIR> --show_supported_models"
)

-alleles = pd.DataFrame({"allele_name": list(unique_alleles)})
+alleles = pd.DataFrame({"allele_name": sorted(list(unique_alleles))})
alleles["allele_id"] = range(len(alleles))
alleles[["allele_id", "allele_name"]].to_csv(args.alleles, sep="\t", index=False)

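The motivation for the sorted() call above: iteration order over a Python set of strings can differ between runs (Python randomizes string hashing per process), so allele_id values assigned by position would not be reproducible otherwise. A minimal standalone sketch of the effect, with made-up allele names:

import pandas as pd

unique_alleles = {"HLA-A*01:01", "HLA-B*07:02", "HLA-C*04:01"}  # hypothetical input

# Sorting fixes the row order, so the positional allele_id assignment is stable across runs
alleles = pd.DataFrame({"allele_name": sorted(unique_alleles)})
alleles["allele_id"] = range(len(alleles))
print(alleles[["allele_id", "allele_name"]].to_csv(sep="\t", index=False))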
58 changes: 32 additions & 26 deletions bin/concat_tsv.py
@@ -2,6 +2,8 @@

import argparse
import sys
+from gzip import GzipFile
+from io import TextIOWrapper

import pandas as pd

@@ -28,35 +30,39 @@ def parse_args(args=None):
return parser.parse_args()


+# Peptide Predictions remain unsorted; otherwise the full dataframe would need to be loaded into memory, which would be computationally very intensive.
+
def main(args=None):
    args = parse_args(args)

-    first_header = pd.DataFrame().columns
-    for i, filename in enumerate(args.input):
-        print("Processing file: ", filename, flush=True)
-
-        # Read input file chunk-wise
-        with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
-            for j, tsv_chunk in enumerate(reader):
-                print(" Chunk: ", j, flush=True)
-                if i == 0 and j == 0:
-                    first_header = tsv_chunk.columns
-                    print("Header: ", first_header.tolist(), flush=True)
-                    tsv_chunk.to_csv(args.output, mode="w", sep="\t", index=False, header=True)
-                else:
-                    if j == 0:
-                        # Check if header of subsequent input files match header of first input file
-                        # (column order must be the same)
-                        if tsv_chunk.columns.tolist() != first_header.tolist():
-                            print(
-                                "ERROR - header of input file",
-                                filename,
-                                "does not match the header of the first input file!",
-                                file=sys.stderr,
-                            )
-                            sys.exit(1)
-
-                    tsv_chunk.to_csv(args.output, mode="a", sep="\t", index=False, header=False)
+    with TextIOWrapper(GzipFile(args.output, 'w', mtime=0), encoding='utf-8') as outfile:
+
+        first_header = pd.DataFrame().columns
+        for i, filename in enumerate(args.input):
+            print("Processing file: ", filename, flush=True)
+
+            # Read input file chunk-wise
+            with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
+                for j, tsv_chunk in enumerate(reader):
+                    print(" Chunk: ", j, flush=True)
+                    if i == 0 and j == 0:
+                        first_header = tsv_chunk.columns
+                        print("Header: ", first_header.tolist(), flush=True)
+                        tsv_chunk.to_csv(outfile, mode="w", sep="\t", index=False, header=True)
+                    else:
+                        if j == 0:
+                            # Check if header of subsequent input files match header of first input file
+                            # (column order must be the same)
+                            if tsv_chunk.columns.tolist() != first_header.tolist():
+                                print(
+                                    "ERROR - header of input file",
+                                    filename,
+                                    "does not match the header of the first input file!",
+                                    file=sys.stderr,
+                                )
+                                sys.exit(1)
+
+                        tsv_chunk.to_csv(outfile, mode="a", sep="\t", index=False, header=False)


if __name__ == "__main__":
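The reason for the GzipFile(..., mtime=0) wrapper introduced above: the gzip header embeds a modification timestamp, so two identical payloads compressed at different times still yield different bytes and break checksum-based snapshot assertions. A minimal in-memory sketch (not pipeline code) of that effect:

from gzip import GzipFile
from io import BytesIO, TextIOWrapper

def write_gz(mtime):
    buf = BytesIO()
    with TextIOWrapper(GzipFile(fileobj=buf, mode="w", mtime=mtime), encoding="utf-8") as fh:
        fh.write("peptide_id\tprediction_score\n")  # hypothetical content
    return buf.getvalue()

# With a fixed mtime the compressed output is byte-identical across runs
assert write_gz(0) == write_gz(0)
# With the default mtime (current time) the gzip header, and hence any checksum, can change between runs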
8 changes: 4 additions & 4 deletions bin/download_proteins_entrez.py
@@ -142,12 +142,12 @@ def main(args=None):
+ "It needs to be a tsv file containing 'taxon_id' and/or optionally 'assembly_id' and 'abundance."
)

-taxIds = list(set(taxIds))
+taxIds = sorted(list(set(taxIds)))
####################################################################################################
# Process TaxIDs

print("Processing the following taxonomy IDs:")
-print(taxIds)
+print(sorted(taxIds))

####################################################################################################
# 0) Check if the taxids link to a strain level organism
@@ -307,7 +307,7 @@ def main(args=None):
# some proteins, such as 487413233, occur within multiple sequences of the assembly!
# -> assembly only listed once!

-proteinIds = list(dict_proteinId_assemblyIds.keys())
+proteinIds = sorted(list(dict_proteinId_assemblyIds.keys()))

print("# proteins (unique): ", len(proteinIds))
# -> # proteins with refseq source (<= # IPG proteins)
@@ -394,7 +394,7 @@ def main(args=None):
for proteinId in proteinIds:
accVersion = dict_protein_uid_acc[proteinId]
# write out protein_tmp_id, entity_name (taxon_id)
-for assemblyId in dict_proteinId_assemblyIds[proteinId]:
+for assemblyId in sorted(dict_proteinId_assemblyIds[proteinId]):
taxId = dict_assemblyId_taxId[assemblyId]
print(accVersion, taxId, sep="\t", file=args.entities_proteins, flush=True)

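The sorts in download_proteins_entrez.py serve the same reproducibility goal: taxIds is derived from a set, and dict_proteinId_assemblyIds keeps whatever insertion order the Entrez queries happened to produce, so both are sorted before they drive any output. A small sketch with made-up IDs:

dict_proteinId_assemblyIds = {"487413233": ["GCF_000005845.2"], "16128551": ["GCF_000008865.2"]}  # hypothetical
proteinIds = sorted(dict_proteinId_assemblyIds.keys())
print(proteinIds)  # ['16128551', '487413233'] regardless of insertion order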
8 changes: 6 additions & 2 deletions bin/fasta_to_tsv.py
@@ -14,9 +14,13 @@
)
args = parser.parse_args()

+records_out = []
with gzip.open(args.input, "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        if args.remove_asterisk and record.seq[-1] == "*":
-            print(f"{record.id}\t{record.seq[:-1]}")
+            records_out.append([str(record.id),"\t",str(record.seq[:-1]),"\n"])
        else:
-            print(f"{record.id}\t{record.seq}")
+            records_out.append([str(record.id),"\t",str(record.seq),"\n"])
+# Two dimensional array to enable sorting
+records_out = sorted(records_out, key=lambda x: x[0])
+print("".join(["".join(rec) for rec in records_out]))
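Collecting the records and sorting them by record ID makes the emitted TSV independent of the record order in the input FASTA. Roughly, assuming --remove_asterisk is set (records are made up):

records = [("contig_2_1", "MKVL*"), ("contig_1_1", "MAAT*")]  # hypothetical (id, sequence) pairs

records_out = []
for rec_id, seq in records:
    records_out.append([rec_id, "\t", seq[:-1], "\n"])  # strip the trailing '*'
records_out = sorted(records_out, key=lambda x: x[0])
print("".join(["".join(rec) for rec in records_out]))  # contig_1_1 always precedes contig_2_1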
2 changes: 1 addition & 1 deletion bin/generate_peptides.py
@@ -159,7 +159,7 @@ def main(args=None):
results["count"] = pd.to_numeric(results["count"], downcast="unsigned")
# prepare df for joining
results.set_index("peptide_sequence", inplace=True)
-results.sort_index(inplace=True)
+results.sort_index(inplace=True, kind="stable")

unique_peptides = pd.DataFrame(index=results.index.drop_duplicates())
unique_peptides["peptide_id"] = range(id_counter, id_counter + len(unique_peptides))
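The kind="stable" argument matters because the default quicksort is not stable: rows sharing the same peptide_sequence could end up in a different relative order from run to run, shifting the subsequent peptide_id assignment. A small sketch with made-up data:

import pandas as pd

results = pd.DataFrame(
    {"protein_id": [7, 3, 5], "count": [1, 2, 1]},
    index=pd.Index(["AAAAKLML", "AAAAKLML", "CILKMWWA"], name="peptide_sequence"),
)
results.sort_index(inplace=True, kind="stable")
print(results)  # the two AAAAKLML rows keep their original relative order (protein_id 7 before 3)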
2 changes: 1 addition & 1 deletion bin/generate_protein_and_entity_ids.py
@@ -150,7 +150,7 @@ def main(args=None):
# (i.e. microbiome_bare_id) and assign the corresponding microbiome_id to the entities
for microbiome_id in microbiomes.groupby("microbiome_bare_id").get_group(microbiome_bare_id)[
"microbiome_id"
-]:
+].sort_values():
entities = pd.DataFrame()
entities = proteins[["entity_name"]].drop_duplicates()
entities["entity_id"] = range(next_entity_id, next_entity_id + len(entities))
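The .sort_values() call fixes the order in which the microbiome_ids of one microbiome_bare_id group are visited, so the entity_id ranges assigned inside the loop come out in a reproducible sequence. Sketch with made-up IDs:

import pandas as pd

microbiomes = pd.DataFrame({"microbiome_bare_id": [0, 0, 1], "microbiome_id": [2, 1, 3]})  # hypothetical
for microbiome_id in microbiomes.groupby("microbiome_bare_id").get_group(0)["microbiome_id"].sort_values():
    print(microbiome_id)  # 1, then 2, independent of the input row order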
2 changes: 1 addition & 1 deletion bin/unify_model_lengths.py
@@ -68,7 +68,7 @@ def main():
samplesheet = pd.read_csv(args.input)

# Retrieve unique list of alleles
-alleles_s = {allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")}
+alleles_s = sorted({allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")})
log_str += f"Found the following alleles: {', '.join(alleles_s)}\n\n"
# Parse alleles to epytope convention
predictor = EpitopePredictorFactory(args.method)
2 changes: 1 addition & 1 deletion tests/pipeline/test.nf.test.snap
@@ -33,4 +33,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_all.nf.test.snap
@@ -44,4 +44,4 @@
},
"timestamp": "2024-07-17T15:58:41.467969153"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_assembly_only.nf.test.snap
@@ -33,4 +33,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_bins_only.nf.test.snap
@@ -33,4 +33,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_coassembly.nf.test.snap
@@ -33,4 +33,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mhcflurry.nf.test.snap
@@ -32,4 +32,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mhcnuggets_1.nf.test.snap
@@ -32,4 +32,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mhcnuggets_2.nf.test.snap
@@ -32,4 +32,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mouse.nf.test.snap
@@ -44,4 +44,4 @@
},
"timestamp": "2024-07-18T05:18:07.866546467"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mouse_all_pep_lengths.nf.test.snap
@@ -32,4 +32,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_taxa_only.nf.test.snap
@@ -44,4 +44,4 @@
},
"timestamp": "2024-07-18T14:49:36.871618584"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_taxa_specific_assembly.nf.test.snap
@@ -44,4 +44,4 @@
},
"timestamp": "2024-07-17T16:08:40.670649421"
}
-}
+}
4 changes: 2 additions & 2 deletions workflows/metapep.nf
@@ -216,8 +216,8 @@ workflow METAPEP {
ch_merge_predictions_input_warn = MERGE_PREDICTIONS_BUFFER.out.ch_prediction_warnings_merged_buffer.mix(ch_predictions_unbuffered.warnings)

MERGE_PREDICTIONS (
-ch_merge_predictions_input_pred.collect(),
-ch_merge_predictions_input_warn.collect()
+ch_merge_predictions_input_pred.collect(sort: { it.baseName }),
+ch_merge_predictions_input_warn.collect(sort: { it.baseName })
)
ch_versions = ch_versions.mix(MERGE_PREDICTIONS.out.versions)

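The same idea applies on the workflow side: collect() gathers channel items in task completion order, which can vary between runs, so both collections are now sorted by file baseName before being passed to MERGE_PREDICTIONS. The effect is analogous to this Python sketch (file names are made up; the actual change is the Nextflow collect(sort: ...) shown above):

from pathlib import Path

# hypothetical per-chunk prediction files arriving in nondeterministic order
files = [Path("predictions_2.tsv.gz"), Path("predictions_0.tsv.gz"), Path("predictions_1.tsv.gz")]
ordered = sorted(files, key=lambda p: p.name)  # like collect(sort: { it.baseName })
print([p.name for p in ordered])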
