fix sorting and gzip diffs between runs
tillenglert committed Jul 10, 2024
1 parent 4641a59 commit 9928067
Showing 4 changed files with 44 additions and 34 deletions.
58 changes: 32 additions & 26 deletions bin/concat_tsv.py
@@ -2,6 +2,8 @@
 
 import argparse
 import sys
+from gzip import GzipFile
+from io import TextIOWrapper
 
 import pandas as pd
 
@@ -28,35 +30,39 @@ def parse_args(args=None):
     return parser.parse_args()
 
 
+# Peptide predictions remain unsorted; otherwise the full dataframe would need to be loaded into memory, which would be computationally very intensive.
+
 def main(args=None):
     args = parse_args(args)
 
-    first_header = pd.DataFrame().columns
-    for i, filename in enumerate(args.input):
-        print("Processing file: ", filename, flush=True)
-
-        # Read input file chunk-wise
-        with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
-            for j, tsv_chunk in enumerate(reader):
-                print("  Chunk: ", j, flush=True)
-                if i == 0 and j == 0:
-                    first_header = tsv_chunk.columns
-                    print("Header: ", first_header.tolist(), flush=True)
-                    tsv_chunk.to_csv(args.output, mode="w", sep="\t", index=False, header=True)
-                else:
-                    if j == 0:
-                        # Check if header of subsequent input files match header of first input file
-                        # (column order must be the same)
-                        if tsv_chunk.columns.tolist() != first_header.tolist():
-                            print(
-                                "ERROR - header of input file",
-                                filename,
-                                "does not match the header of the first input file!",
-                                file=sys.stderr,
-                            )
-                            sys.exit(1)
-
-                    tsv_chunk.to_csv(args.output, mode="a", sep="\t", index=False, header=False)
+    with TextIOWrapper(GzipFile(args.output, 'w', mtime=0), encoding='utf-8') as outfile:
+
+        first_header = pd.DataFrame().columns
+        for i, filename in enumerate(args.input):
+            print("Processing file: ", filename, flush=True)
+
+            # Read input file chunk-wise
+            with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
+                for j, tsv_chunk in enumerate(reader):
+                    print("  Chunk: ", j, flush=True)
+                    if i == 0 and j == 0:
+                        first_header = tsv_chunk.columns
+                        print("Header: ", first_header.tolist(), flush=True)
+                        tsv_chunk.to_csv(outfile, mode="w", sep="\t", index=False, header=True)
+                    else:
+                        if j == 0:
+                            # Check if header of subsequent input files match header of first input file
+                            # (column order must be the same)
+                            if tsv_chunk.columns.tolist() != first_header.tolist():
+                                print(
+                                    "ERROR - header of input file",
+                                    filename,
+                                    "does not match the header of the first input file!",
+                                    file=sys.stderr,
+                                )
+                                sys.exit(1)
+
+                        tsv_chunk.to_csv(outfile, mode="a", sep="\t", index=False, header=False)
 
 
 if __name__ == "__main__":
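This is the gzip half of the fix: gzip.open() stamps the current time into the mtime field of the 10-byte gzip header, so two runs producing identical TSV content still yield different compressed bytes. Passing mtime=0 to GzipFile pins that field; since gzip.open() exposes no mtime parameter, the commit builds the text handle explicitly from GzipFile plus TextIOWrapper. A minimal standalone sketch (not part of the commit, toy data) of the effect:

import time
from gzip import GzipFile
from io import BytesIO, TextIOWrapper

def compress(mtime=None):
    buf = BytesIO()
    # GzipFile writes an mtime field into the gzip header;
    # the default (None) means "use the current time".
    with TextIOWrapper(GzipFile(fileobj=buf, mode="w", mtime=mtime), encoding="utf-8") as fh:
        fh.write("id\tsequence\n0\tMKV\n")
    return buf.getvalue()

a = compress()   # header carries the current timestamp
time.sleep(1)
b = compress()   # one second later -> different bytes
print(a == b)                                   # False: same content, different headers
print(compress(mtime=0) == compress(mtime=0))   # True: byte-identical across runs

The TextIOWrapper is needed because GzipFile only accepts bytes; it plays the role that mode="wt" plays for gzip.open().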
5 changes: 3 additions & 2 deletions bin/generate_peptides.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 
 import argparse
-import gzip
+from gzip import GzipFile
+from io import TextIOWrapper
 import sys
 
 import pandas as pd
@@ -126,7 +127,7 @@ def main(args=None):
 
     ####################
     # generate peptides
-    with gzip.open(args.peptides, "wt") as pep_handle:
+    with TextIOWrapper(GzipFile(args.peptides, 'w', mtime=0), encoding='utf-8') as pep_handle:
         print_header = True
         id_counter = 0
13 changes: 8 additions & 5 deletions bin/generate_protein_and_entity_ids.py
@@ -6,7 +6,8 @@
 # proteins from 'proteins' input type: not known if unique or not, handle separately for now (in case of unique ids this causes unnecessary redundancy; could add parameter for this in future)
 
 import argparse
-import gzip
+from gzip import GzipFile
+from io import TextIOWrapper
 import sys
 
 import pandas as pd
@@ -92,7 +93,7 @@ def main(args=None):
     entities_proteins_columns = ["entity_id", "protein_id"]
     entities_columns = ["entity_id", "entity_name"]
     microbiomes_entities_columns = ["microbiome_id", "entity_id"]
-    with gzip.open(args.out_proteins, "wt") as outfile_proteins, open(
+    with TextIOWrapper(GzipFile(args.out_proteins, 'w', mtime=0), encoding='utf-8') as outfile_proteins, open(
         args.out_entities_proteins, "w"
     ) as outfile_entities_proteins, open(args.out_entities, "w") as outfile_entities, open(
         args.out_microbiomes_entities, "w"
@@ -170,6 +171,7 @@ def main(args=None):
 
         # Write proteins
         proteins.rename(columns={"protein_tmp_id": "protein_orig_id"}, inplace=True)
+        proteins = proteins.sort_values(by="protein_orig_id").drop("protein_id", axis=1).reset_index(drop=True).reset_index(names="protein_id")
         proteins[proteins_columns].to_csv(outfile_proteins, sep="\t", header=False, index=False)
         # Write entities_proteins
         proteins.merge(entities)[["entity_id", "protein_id"]].drop_duplicates().to_csv(
@@ -210,12 +212,14 @@ def main(args=None):
         next_entity_id += len(entities)
 
         # Write proteins
-        proteins.rename(columns={"protein_tmp_id": "protein_orig_id"})[proteins_columns].to_csv(
-            outfile_proteins, sep="\t", header=False, index=False
-        )
+        proteins.rename(columns={"protein_tmp_id": "protein_orig_id"}, inplace=True)
+        proteins = proteins.sort_values(by="protein_orig_id").drop("protein_id", axis=1).reset_index(drop=True).reset_index(names="protein_id")
+        proteins[proteins_columns].to_csv(
+            outfile_proteins, sep="\t", header=False, index=False
+        )
 
         entities_microbiomes_proteins = (
-            entities_proteins.merge(proteins)
+            entities_proteins.merge(proteins.rename(columns={"protein_orig_id":"protein_tmp_id"}))
             .merge(entities)
             .merge(microbiomes_entities)[["entity_id", "protein_id", "microbiome_id", "entity_weight"]]
         )
@@ -231,6 +235,5 @@ def main(args=None):
             outfile_microbiomes_entities, sep="\t", header=False, index=False
         )
 
-
 if __name__ == "__main__":
     sys.exit(main())
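This is the sorting half of the fix: protein_id values were previously assigned in input order, which is not stable across runs, so the same protein could receive a different id each time. Sorting on protein_orig_id before renumbering makes the assignment deterministic, and the merge then renames protein_orig_id back to protein_tmp_id because entities_proteins still carries the temporary ids as its join key. A small sketch of the renumbering idiom with made-up data (reset_index(names=...) requires pandas >= 1.5):

import pandas as pd

def renumber(proteins: pd.DataFrame) -> pd.DataFrame:
    # Sort by the stable key, drop the order-dependent ids, then
    # renumber 0..n-1 so ids no longer depend on input order.
    return (
        proteins.sort_values(by="protein_orig_id")
        .drop("protein_id", axis=1)
        .reset_index(drop=True)
        .reset_index(names="protein_id")
    )

run1 = pd.DataFrame({"protein_id": [0, 1, 2], "protein_orig_id": ["c", "a", "b"]})
run2 = run1.iloc[[2, 0, 1]].reset_index(drop=True)  # same rows, different order
run2["protein_id"] = range(len(run2))               # order-dependent ids

print(renumber(run1).equals(renumber(run2)))  # True: identical output either way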
2 changes: 1 addition & 1 deletion bin/unify_model_lengths.py
@@ -68,7 +68,7 @@ def main():
     samplesheet = pd.read_csv(args.input)
 
     # Retrieve unique list of alleles
-    alleles_s = {allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")}
+    alleles_s = sorted({allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")})
     log_str += f"Found the following alleles: {', '.join(alleles_s)}\n\n"
     # Parse alleles to epytope convention
     predictor = EpitopePredictorFactory(args.method)
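This last change removes a subtler nondeterminism: iteration order over a set of strings depends on hash randomization, which Python seeds differently on each interpreter run unless PYTHONHASHSEED is fixed, so the allele order in the log (and anything derived from it) could change between runs. sorted() pins the order; a two-line illustration with hypothetical allele names:

alleles = {"HLA-A*01:01", "HLA-B*07:02", "HLA-A*02:01"}
print(", ".join(alleles))          # order may change from run to run
print(", ".join(sorted(alleles)))  # always: HLA-A*01:01, HLA-A*02:01, HLA-B*07:02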
