diff --git a/bin/concat_tsv.py b/bin/concat_tsv.py
index 174a2612..bc82c8fe 100755
--- a/bin/concat_tsv.py
+++ b/bin/concat_tsv.py
@@ -2,6 +2,8 @@
 
 import argparse
 import sys
+from gzip import GzipFile
+from io import TextIOWrapper
 
 import pandas as pd
 
@@ -28,35 +30,40 @@ def parse_args(args=None):
     return parser.parse_args()
 
 
+# Peptide predictions remain unsorted; sorting them would require loading the full dataframe into memory, which would be computationally very expensive.
+
 def main(args=None):
     args = parse_args(args)
 
-    first_header = pd.DataFrame().columns
-    for i, filename in enumerate(args.input):
-        print("Processing file: ", filename, flush=True)
-
-        # Read input file chunk-wise
-        with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
-            for j, tsv_chunk in enumerate(reader):
-                print(" Chunk: ", j, flush=True)
-                if i == 0 and j == 0:
-                    first_header = tsv_chunk.columns
-                    print("Header: ", first_header.tolist(), flush=True)
-                    tsv_chunk.to_csv(args.output, mode="w", sep="\t", index=False, header=True)
-                else:
-                    if j == 0:
-                        # Check if header of subsequent input files match header of first input file
-                        # (column order must be the same)
-                        if tsv_chunk.columns.tolist() != first_header.tolist():
-                            print(
-                                "ERROR - header of input file",
-                                filename,
-                                "does not match the header of the first input file!",
-                                file=sys.stderr,
-                            )
-                            sys.exit(1)
-
-                    tsv_chunk.to_csv(args.output, mode="a", sep="\t", index=False, header=False)
+    # mtime=0 pins the timestamp field in the gzip header, so identical inputs yield byte-identical output
+    with TextIOWrapper(GzipFile(args.output, 'w', mtime=0), encoding='utf-8') as outfile:
+
+        first_header = pd.DataFrame().columns
+        for i, filename in enumerate(args.input):
+            print("Processing file: ", filename, flush=True)
+
+            # Read input file chunk-wise
+            with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
+                for j, tsv_chunk in enumerate(reader):
+                    print(" Chunk: ", j, flush=True)
+                    if i == 0 and j == 0:
+                        first_header = tsv_chunk.columns
+                        print("Header: ", first_header.tolist(), flush=True)
+                        tsv_chunk.to_csv(outfile, mode="w", sep="\t", index=False, header=True)
+                    else:
+                        if j == 0:
+                            # Check if the header of each subsequent input file matches the header of the first input file
+                            # (column order must be the same)
+                            if tsv_chunk.columns.tolist() != first_header.tolist():
+                                print(
+                                    "ERROR - header of input file",
+                                    filename,
+                                    "does not match the header of the first input file!",
+                                    file=sys.stderr,
+                                )
+                                sys.exit(1)
+
+                        tsv_chunk.to_csv(outfile, mode="a", sep="\t", index=False, header=False)
 
 
 if __name__ == "__main__":
diff --git a/bin/generate_peptides.py b/bin/generate_peptides.py
index c398844e..dbb93dfb 100755
--- a/bin/generate_peptides.py
+++ b/bin/generate_peptides.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 
 import argparse
-import gzip
+from gzip import GzipFile
+from io import TextIOWrapper
 import sys
 
 import pandas as pd
@@ -126,7 +127,7 @@ def main(args=None):
 
     ####################
     # generate peptides
-    with gzip.open(args.peptides, "wt") as pep_handle:
+    with TextIOWrapper(GzipFile(args.peptides, 'w', mtime=0), encoding='utf-8') as pep_handle:
         print_header = True
         id_counter = 0
 
diff --git a/bin/generate_protein_and_entity_ids.py b/bin/generate_protein_and_entity_ids.py
index d3122f6b..e5f07dc0 100755
--- a/bin/generate_protein_and_entity_ids.py
+++ b/bin/generate_protein_and_entity_ids.py
@@ -6,7 +6,8 @@
 # proteins from 'proteins' input type: not known if unique or not, handle separately for now (in case of unique ids this causes unnecessary redundancy; could add parameter for this in future)
 
 import argparse
-import gzip
+from gzip import GzipFile
+from io import TextIOWrapper
 import sys
 
 import pandas as pd
@@ -92,7 +93,7 @@ def main(args=None):
     entities_proteins_columns = ["entity_id", "protein_id"]
     entities_columns = ["entity_id", "entity_name"]
     microbiomes_entities_columns = ["microbiome_id", "entity_id"]
-    with gzip.open(args.out_proteins, "wt") as outfile_proteins, open(
+    with TextIOWrapper(GzipFile(args.out_proteins, 'w', mtime=0), encoding='utf-8') as outfile_proteins, open(
         args.out_entities_proteins, "w"
     ) as outfile_entities_proteins, open(args.out_entities, "w") as outfile_entities, open(
         args.out_microbiomes_entities, "w"
@@ -170,6 +171,8 @@ def main(args=None):
 
         # Write proteins
         proteins.rename(columns={"protein_tmp_id": "protein_orig_id"}, inplace=True)
+        # Re-number protein_id in protein_orig_id order so the table is written deterministically
+        proteins = proteins.sort_values(by="protein_orig_id").drop("protein_id", axis=1).reset_index(drop=True).reset_index(names="protein_id")
         proteins[proteins_columns].to_csv(outfile_proteins, sep="\t", header=False, index=False)
         # Write entities_proteins
         proteins.merge(entities)[["entity_id", "protein_id"]].drop_duplicates().to_csv(
@@ -210,12 +213,16 @@ def main(args=None):
         next_entity_id += len(entities)
 
         # Write proteins
-        proteins.rename(columns={"protein_tmp_id": "protein_orig_id"})[proteins_columns].to_csv(
+        proteins.rename(columns={"protein_tmp_id": "protein_orig_id"}, inplace=True)
+        # Re-number protein_id in protein_orig_id order so the table is written deterministically
+        proteins = proteins.sort_values(by="protein_orig_id").drop("protein_id", axis=1).reset_index(drop=True).reset_index(names="protein_id")
+        proteins[proteins_columns].to_csv(
             outfile_proteins, sep="\t", header=False, index=False
         )
 
         entities_microbiomes_proteins = (
-            entities_proteins.merge(proteins)
+            # proteins now carries protein_orig_id; rename it back so the merge key matches entities_proteins
+            entities_proteins.merge(proteins.rename(columns={"protein_orig_id": "protein_tmp_id"}))
             .merge(entities)
             .merge(microbiomes_entities)[["entity_id", "protein_id", "microbiome_id", "entity_weight"]]
         )
@@ -231,6 +238,5 @@ def main(args=None):
             outfile_microbiomes_entities, sep="\t", header=False, index=False
         )
 
-
 if __name__ == "__main__":
     sys.exit(main())
diff --git a/bin/unify_model_lengths.py b/bin/unify_model_lengths.py
index ae9a9938..ffba09d0 100755
--- a/bin/unify_model_lengths.py
+++ b/bin/unify_model_lengths.py
@@ -68,7 +68,7 @@ def main():
     samplesheet = pd.read_csv(args.input)
     # Retrieve unique list of alleles
-    alleles_s = {allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")}
+    alleles_s = sorted({allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")})
     log_str += f"Found the following alleles: {', '.join(alleles_s)}\n\n"
 
     # Parse alleles to epytope convention
     predictor = EpitopePredictorFactory(args.method)
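
The TextIOWrapper(GzipFile(..., mtime=0)) construction used in all three scripts replaces gzip.open(path, "wt"), which leaves GzipFile's mtime at its default and thereby embeds the current time in the gzip header: two runs over identical input then produce different compressed bytes. Pinning mtime to 0, together with the sorted protein tables and allele lists above, is what makes the outputs byte-identical across runs. A minimal standalone sketch of the effect (the tab-separated row is illustrative only, not a real pipeline record):

    #!/usr/bin/env python3
    # Sketch: why GzipFile(..., mtime=0) yields reproducible gzip output.
    from gzip import GzipFile
    from io import BytesIO, TextIOWrapper

    def compress(text, mtime=None):
        buf = BytesIO()
        # mtime=None (the gzip.open default) stores the current time in the
        # gzip header; mtime=0 pins that field to a constant.
        with TextIOWrapper(GzipFile(fileobj=buf, mode="w", mtime=mtime), encoding="utf-8") as handle:
            handle.write(text)
        return buf.getvalue()

    row = "entity_id\tprotein_id\n0\t0\n"
    assert compress(row, mtime=0) == compress(row, mtime=0)  # always byte-identical
    # compress(row) == compress(row) can fail whenever the clock ticks between the two calls.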