diff --git a/bin/check_samplesheet_create_tables.py b/bin/check_samplesheet_create_tables.py
index 7dca49ff..b62e2685 100755
--- a/bin/check_samplesheet_create_tables.py
+++ b/bin/check_samplesheet_create_tables.py
@@ -242,7 +242,7 @@ def process_samplesheet(args):
             + "'nextflow run metapep -profile --outdir --show_supported_models"
         )
 
-    alleles = pd.DataFrame({"allele_name": list(unique_alleles)})
+    alleles = pd.DataFrame({"allele_name": sorted(list(unique_alleles))})
     alleles["allele_id"] = range(len(alleles))
     alleles[["allele_id", "allele_name"]].to_csv(args.alleles, sep="\t", index=False)
 
diff --git a/bin/concat_tsv.py b/bin/concat_tsv.py
index 174a2612..bc82c8fe 100755
--- a/bin/concat_tsv.py
+++ b/bin/concat_tsv.py
@@ -2,6 +2,8 @@
 
 import argparse
 import sys
+from gzip import GzipFile
+from io import TextIOWrapper
 
 import pandas as pd
 
@@ -28,35 +30,39 @@ def parse_args(args=None):
     return parser.parse_args()
 
 
+# Peptide Predictions remain unsorted, otherwise full dataframe needs to be loaded to memory, which would be computational very intensive.
+
 def main(args=None):
     args = parse_args(args)
 
-    first_header = pd.DataFrame().columns
-    for i, filename in enumerate(args.input):
-        print("Processing file: ", filename, flush=True)
-
-        # Read input file chunk-wise
-        with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
-            for j, tsv_chunk in enumerate(reader):
-                print("  Chunk: ", j, flush=True)
-                if i == 0 and j == 0:
-                    first_header = tsv_chunk.columns
-                    print("Header: ", first_header.tolist(), flush=True)
-                    tsv_chunk.to_csv(args.output, mode="w", sep="\t", index=False, header=True)
-                else:
-                    if j == 0:
-                        # Check if header of subsequent input files match header of first input file
-                        # (column order must be the same)
-                        if tsv_chunk.columns.tolist() != first_header.tolist():
-                            print(
-                                "ERROR - header of input file",
-                                filename,
-                                "does not match the header of the first input file!",
-                                file=sys.stderr,
-                            )
-                            sys.exit(1)
-
-                    tsv_chunk.to_csv(args.output, mode="a", sep="\t", index=False, header=False)
+    with TextIOWrapper(GzipFile(args.output, 'w', mtime=0), encoding='utf-8') as outfile:
+
+        first_header = pd.DataFrame().columns
+        for i, filename in enumerate(args.input):
+            print("Processing file: ", filename, flush=True)
+
+            # Read input file chunk-wise
+            with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
+                for j, tsv_chunk in enumerate(reader):
+                    print("  Chunk: ", j, flush=True)
+                    if i == 0 and j == 0:
+                        first_header = tsv_chunk.columns
+                        print("Header: ", first_header.tolist(), flush=True)
+                        tsv_chunk.to_csv(outfile, mode="w", sep="\t", index=False, header=True)
+                    else:
+                        if j == 0:
+                            # Check if header of subsequent input files match header of first input file
+                            # (column order must be the same)
+                            if tsv_chunk.columns.tolist() != first_header.tolist():
+                                print(
+                                    "ERROR - header of input file",
+                                    filename,
+                                    "does not match the header of the first input file!",
+                                    file=sys.stderr,
+                                )
+                                sys.exit(1)
+
+                        tsv_chunk.to_csv(outfile, mode="a", sep="\t", index=False, header=False)
 
 
 if __name__ == "__main__":
diff --git a/bin/download_proteins_entrez.py b/bin/download_proteins_entrez.py
index 7adbf7a7..d55cad4f 100755
--- a/bin/download_proteins_entrez.py
+++ b/bin/download_proteins_entrez.py
@@ -142,12 +142,12 @@ def main(args=None):
             + "It needs to be a tsv file containing 'taxon_id' and/or optionally 'assembly_id' and 'abundance."
         )
 
-    taxIds = list(set(taxIds))
+    taxIds = sorted(list(set(taxIds)))
 
     ####################################################################################################
     # Process TaxIDs
     print("Processing the following taxonomy IDs:")
-    print(taxIds)
+    print(sorted(taxIds))
 
     ####################################################################################################
     # 0) Check if the taxids link to a strain level organism
@@ -307,7 +307,7 @@ def main(args=None):
     # some proteins, such as 487413233, occur within multiple sequences of the assembly!
    # -> assembly only listed once!
 
-    proteinIds = list(dict_proteinId_assemblyIds.keys())
+    proteinIds = sorted(list(dict_proteinId_assemblyIds.keys()))
     print("# proteins (unique): ", len(proteinIds))
     # -> # proteins with refseq source (<= # IPG proteins)
 
@@ -394,7 +394,7 @@ def main(args=None):
     for proteinId in proteinIds:
         accVersion = dict_protein_uid_acc[proteinId]
         # write out protein_tmp_id, entity_name (taxon_id)
-        for assemblyId in dict_proteinId_assemblyIds[proteinId]:
+        for assemblyId in sorted(dict_proteinId_assemblyIds[proteinId]):
             taxId = dict_assemblyId_taxId[assemblyId]
             print(accVersion, taxId, sep="\t", file=args.entities_proteins, flush=True)
 
diff --git a/bin/fasta_to_tsv.py b/bin/fasta_to_tsv.py
index 62dce641..4c780943 100755
--- a/bin/fasta_to_tsv.py
+++ b/bin/fasta_to_tsv.py
@@ -14,9 +14,13 @@
 )
 args = parser.parse_args()
 
+records_out = []
 with gzip.open(args.input, "rt") as handle:
     for record in SeqIO.parse(handle, "fasta"):
         if args.remove_asterisk and record.seq[-1] == "*":
-            print(f"{record.id}\t{record.seq[:-1]}")
+            records_out.append([str(record.id),"\t",str(record.seq[:-1]),"\n"])
         else:
-            print(f"{record.id}\t{record.seq}")
+            records_out.append([str(record.id),"\t",str(record.seq),"\n"])
+# Two dimensional array to enable sorting
+records_out = sorted(records_out, key=lambda x: x[0])
+print("".join(["".join(rec) for rec in records_out]))
diff --git a/bin/generate_peptides.py b/bin/generate_peptides.py
index c398844e..097f7c70 100755
--- a/bin/generate_peptides.py
+++ b/bin/generate_peptides.py
@@ -159,7 +159,7 @@ def main(args=None):
     results["count"] = pd.to_numeric(results["count"], downcast="unsigned")
     # prepare df for joining
     results.set_index("peptide_sequence", inplace=True)
-    results.sort_index(inplace=True)
+    results.sort_index(inplace=True, kind="stable")
 
     unique_peptides = pd.DataFrame(index=results.index.drop_duplicates())
     unique_peptides["peptide_id"] = range(id_counter, id_counter + len(unique_peptides))
diff --git a/bin/generate_protein_and_entity_ids.py b/bin/generate_protein_and_entity_ids.py
index d3122f6b..793fe1c3 100755
--- a/bin/generate_protein_and_entity_ids.py
+++ b/bin/generate_protein_and_entity_ids.py
@@ -150,7 +150,7 @@ def main(args=None):
         # (i.e. microbiome_bare_id) and assign the corresponding microbiome_id to the entities
         for microbiome_id in microbiomes.groupby("microbiome_bare_id").get_group(microbiome_bare_id)[
             "microbiome_id"
-        ]:
+        ].sort_values():
             entities = pd.DataFrame()
             entities = proteins[["entity_name"]].drop_duplicates()
             entities["entity_id"] = range(next_entity_id, next_entity_id + len(entities))
diff --git a/bin/unify_model_lengths.py b/bin/unify_model_lengths.py
index ae9a9938..ffba09d0 100755
--- a/bin/unify_model_lengths.py
+++ b/bin/unify_model_lengths.py
@@ -68,7 +68,7 @@ def main():
     samplesheet = pd.read_csv(args.input)
 
     # Retrieve unique list of alleles
-    alleles_s = {allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")}
+    alleles_s = sorted({allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")})
     log_str += f"Found the following alleles: {', '.join(alleles_s)}\n\n"
     # Parse alleles to epytope convention
     predictor = EpitopePredictorFactory(args.method)
diff --git a/tests/pipeline/test.nf.test.snap b/tests/pipeline/test.nf.test.snap
index 277ec42c..eded0993 100644
--- a/tests/pipeline/test.nf.test.snap
+++ b/tests/pipeline/test.nf.test.snap
@@ -33,4 +33,4 @@
         },
         "timestamp": "2024-06-26T08:14:54+0000"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_all.nf.test.snap b/tests/pipeline/test_all.nf.test.snap
index 53b0815f..6f0da13a 100644
--- a/tests/pipeline/test_all.nf.test.snap
+++ b/tests/pipeline/test_all.nf.test.snap
@@ -44,4 +44,4 @@
         },
         "timestamp": "2024-07-17T15:58:41.467969153"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_assembly_only.nf.test.snap b/tests/pipeline/test_assembly_only.nf.test.snap
index 010f0d24..b9e44f64 100644
--- a/tests/pipeline/test_assembly_only.nf.test.snap
+++ b/tests/pipeline/test_assembly_only.nf.test.snap
@@ -33,4 +33,4 @@
         },
         "timestamp": "2024-06-26T08:14:54+0000"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_bins_only.nf.test.snap b/tests/pipeline/test_bins_only.nf.test.snap
index a58860c8..3566d8c8 100644
--- a/tests/pipeline/test_bins_only.nf.test.snap
+++ b/tests/pipeline/test_bins_only.nf.test.snap
@@ -33,4 +33,4 @@
         },
         "timestamp": "2024-06-26T08:14:54+0000"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_coassembly.nf.test.snap b/tests/pipeline/test_coassembly.nf.test.snap
index 78b2790d..3d3b3719 100644
--- a/tests/pipeline/test_coassembly.nf.test.snap
+++ b/tests/pipeline/test_coassembly.nf.test.snap
@@ -33,4 +33,4 @@
         },
         "timestamp": "2024-06-26T08:14:54+0000"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_mhcflurry.nf.test.snap b/tests/pipeline/test_mhcflurry.nf.test.snap
index 6d914bea..0a6e3cf8 100644
--- a/tests/pipeline/test_mhcflurry.nf.test.snap
+++ b/tests/pipeline/test_mhcflurry.nf.test.snap
@@ -32,4 +32,4 @@
         },
         "timestamp": "2024-06-26T08:14:54+0000"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_mhcnuggets_1.nf.test.snap b/tests/pipeline/test_mhcnuggets_1.nf.test.snap
index 37453864..7cf89733 100644
--- a/tests/pipeline/test_mhcnuggets_1.nf.test.snap
+++ b/tests/pipeline/test_mhcnuggets_1.nf.test.snap
@@ -32,4 +32,4 @@
         },
         "timestamp": "2024-06-26T08:14:54+0000"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_mhcnuggets_2.nf.test.snap b/tests/pipeline/test_mhcnuggets_2.nf.test.snap
index 5639403b..a9a02e5a 100644
--- a/tests/pipeline/test_mhcnuggets_2.nf.test.snap
+++ b/tests/pipeline/test_mhcnuggets_2.nf.test.snap
@@ -32,4 +32,4 @@
         },
         "timestamp": "2024-06-26T08:14:54+0000"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_mouse.nf.test.snap b/tests/pipeline/test_mouse.nf.test.snap
index 50b5264f..02f19085 100644
--- a/tests/pipeline/test_mouse.nf.test.snap
+++ b/tests/pipeline/test_mouse.nf.test.snap
@@ -44,4 +44,4 @@
         },
         "timestamp": "2024-07-18T05:18:07.866546467"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_mouse_all_pep_lengths.nf.test.snap b/tests/pipeline/test_mouse_all_pep_lengths.nf.test.snap
index e82a0ab2..c2e92dbc 100644
--- a/tests/pipeline/test_mouse_all_pep_lengths.nf.test.snap
+++ b/tests/pipeline/test_mouse_all_pep_lengths.nf.test.snap
@@ -32,4 +32,4 @@
         },
         "timestamp": "2024-06-26T08:14:54+0000"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_taxa_only.nf.test.snap b/tests/pipeline/test_taxa_only.nf.test.snap
index 88e60291..813eca0e 100644
--- a/tests/pipeline/test_taxa_only.nf.test.snap
+++ b/tests/pipeline/test_taxa_only.nf.test.snap
@@ -44,4 +44,4 @@
         },
         "timestamp": "2024-07-18T14:49:36.871618584"
     }
-}
\ No newline at end of file
+}
diff --git a/tests/pipeline/test_taxa_specific_assembly.nf.test.snap b/tests/pipeline/test_taxa_specific_assembly.nf.test.snap
index fe8ec9f7..ddb1ec49 100644
--- a/tests/pipeline/test_taxa_specific_assembly.nf.test.snap
+++ b/tests/pipeline/test_taxa_specific_assembly.nf.test.snap
@@ -44,4 +44,4 @@
         },
         "timestamp": "2024-07-17T16:08:40.670649421"
     }
-}
\ No newline at end of file
+}
diff --git a/workflows/metapep.nf b/workflows/metapep.nf
index 794bfab6..7496f9e3 100644
--- a/workflows/metapep.nf
+++ b/workflows/metapep.nf
@@ -216,8 +216,8 @@ workflow METAPEP {
     ch_merge_predictions_input_warn = MERGE_PREDICTIONS_BUFFER.out.ch_prediction_warnings_merged_buffer.mix(ch_predictions_unbuffered.warnings)
 
     MERGE_PREDICTIONS (
-        ch_merge_predictions_input_pred.collect(),
-        ch_merge_predictions_input_warn.collect()
+        ch_merge_predictions_input_pred.collect(sort: { it.baseName }),
+        ch_merge_predictions_input_warn.collect(sort: { it.baseName })
     )
     ch_versions = ch_versions.mix(MERGE_PREDICTIONS.out.versions)
 