fix sorting and gzip diffs between runs
tillenglert committed Jul 10, 2024
1 parent 4641a59 commit 9928067
Showing 4 changed files with 44 additions and 34 deletions.
58 changes: 32 additions & 26 deletions bin/concat_tsv.py
@@ -2,6 +2,8 @@
 
 import argparse
 import sys
+from gzip import GzipFile
+from io import TextIOWrapper
 
 import pandas as pd
 
@@ -28,35 +30,39 @@ def parse_args(args=None):
     return parser.parse_args()
 
 
+# Peptide predictions remain unsorted; otherwise the full dataframe would need to be loaded into memory, which would be computationally very intensive.
+
 def main(args=None):
     args = parse_args(args)
 
-    first_header = pd.DataFrame().columns
-    for i, filename in enumerate(args.input):
-        print("Processing file: ", filename, flush=True)
-
-        # Read input file chunk-wise
-        with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
-            for j, tsv_chunk in enumerate(reader):
-                print("  Chunk: ", j, flush=True)
-                if i == 0 and j == 0:
-                    first_header = tsv_chunk.columns
-                    print("Header: ", first_header.tolist(), flush=True)
-                    tsv_chunk.to_csv(args.output, mode="w", sep="\t", index=False, header=True)
-                else:
-                    if j == 0:
-                        # Check if header of subsequent input files match header of first input file
-                        # (column order must be the same)
-                        if tsv_chunk.columns.tolist() != first_header.tolist():
-                            print(
-                                "ERROR - header of input file",
-                                filename,
-                                "does not match the header of the first input file!",
-                                file=sys.stderr,
-                            )
-                            sys.exit(1)
-
-                    tsv_chunk.to_csv(args.output, mode="a", sep="\t", index=False, header=False)
+    with TextIOWrapper(GzipFile(args.output, 'w', mtime=0), encoding='utf-8') as outfile:
+
+        first_header = pd.DataFrame().columns
+        for i, filename in enumerate(args.input):
+            print("Processing file: ", filename, flush=True)
+
+            # Read input file chunk-wise
+            with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
+                for j, tsv_chunk in enumerate(reader):
+                    print("  Chunk: ", j, flush=True)
+                    if i == 0 and j == 0:
+                        first_header = tsv_chunk.columns
+                        print("Header: ", first_header.tolist(), flush=True)
+                        tsv_chunk.to_csv(outfile, mode="w", sep="\t", index=False, header=True)
+                    else:
+                        if j == 0:
+                            # Check if header of subsequent input files match header of first input file
+                            # (column order must be the same)
+                            if tsv_chunk.columns.tolist() != first_header.tolist():
+                                print(
+                                    "ERROR - header of input file",
+                                    filename,
+                                    "does not match the header of the first input file!",
+                                    file=sys.stderr,
+                                )
+                                sys.exit(1)
+
+                        tsv_chunk.to_csv(outfile, mode="a", sep="\t", index=False, header=False)
 
 
 if __name__ == "__main__":
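This is the gzip half of the fix: gzip.open() stamps the current time into the mtime field of the 10-byte gzip header, so two runs producing identical TSV content still yield different compressed bytes. Passing mtime=0 to GzipFile pins that field; since gzip.open() exposes no mtime parameter, the commit builds the text handle explicitly from GzipFile plus TextIOWrapper. A minimal standalone sketch (not part of the commit, toy data) of the effect:

import time
from gzip import GzipFile
from io import BytesIO, TextIOWrapper

def compress(mtime=None):
    buf = BytesIO()
    # GzipFile writes an mtime field into the gzip header;
    # the default (None) means "use the current time".
    with TextIOWrapper(GzipFile(fileobj=buf, mode="w", mtime=mtime), encoding="utf-8") as fh:
        fh.write("id\tsequence\n0\tMKV\n")
    return buf.getvalue()

a = compress()   # header carries the current timestamp
time.sleep(1)
b = compress()   # one second later -> different bytes
print(a == b)                                   # False: same content, different headers
print(compress(mtime=0) == compress(mtime=0))   # True: byte-identical across runs

The TextIOWrapper is needed because GzipFile only accepts bytes; it plays the role that mode="wt" plays for gzip.open().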
5 changes: 3 additions & 2 deletions bin/generate_peptides.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 
 import argparse
-import gzip
+from gzip import GzipFile
+from io import TextIOWrapper
 import sys
 
 import pandas as pd
@@ -126,7 +127,7 @@ def main(args=None):
 
     ####################
     # generate peptides
-    with gzip.open(args.peptides, "wt") as pep_handle:
+    with TextIOWrapper(GzipFile(args.peptides, 'w', mtime=0), encoding='utf-8') as pep_handle:
         print_header = True
         id_counter = 0
13 changes: 8 additions & 5 deletions bin/generate_protein_and_entity_ids.py
@@ -6,7 +6,8 @@
 # proteins from 'proteins' input type: not known if unique or not, handle separately for now (in case of unique ids this causes unnecessary redundancy; could add parameter for this in future)
 
 import argparse
-import gzip
+from gzip import GzipFile
+from io import TextIOWrapper
 import sys
 
 import pandas as pd
@@ -92,7 +93,7 @@ def main(args=None):
     entities_proteins_columns = ["entity_id", "protein_id"]
     entities_columns = ["entity_id", "entity_name"]
     microbiomes_entities_columns = ["microbiome_id", "entity_id"]
-    with gzip.open(args.out_proteins, "wt") as outfile_proteins, open(
+    with TextIOWrapper(GzipFile(args.out_proteins, 'w', mtime=0), encoding='utf-8') as outfile_proteins, open(
         args.out_entities_proteins, "w"
     ) as outfile_entities_proteins, open(args.out_entities, "w") as outfile_entities, open(
         args.out_microbiomes_entities, "w"
@@ -170,6 +171,7 @@ def main(args=None):
 
         # Write proteins
         proteins.rename(columns={"protein_tmp_id": "protein_orig_id"}, inplace=True)
+        proteins = proteins.sort_values(by="protein_orig_id").drop("protein_id", axis=1).reset_index(drop=True).reset_index(names="protein_id")
         proteins[proteins_columns].to_csv(outfile_proteins, sep="\t", header=False, index=False)
         # Write entities_proteins
         proteins.merge(entities)[["entity_id", "protein_id"]].drop_duplicates().to_csv(
@@ -210,12 +212,14 @@ def main(args=None):
         next_entity_id += len(entities)
 
         # Write proteins
-        proteins.rename(columns={"protein_tmp_id": "protein_orig_id"})[proteins_columns].to_csv(
-            outfile_proteins, sep="\t", header=False, index=False
-        )
+        proteins.rename(columns={"protein_tmp_id": "protein_orig_id"}, inplace=True)
+        proteins = proteins.sort_values(by="protein_orig_id").drop("protein_id", axis=1).reset_index(drop=True).reset_index(names="protein_id")
+        proteins[proteins_columns].to_csv(
+            outfile_proteins, sep="\t", header=False, index=False
+        )
 
         entities_microbiomes_proteins = (
-            entities_proteins.merge(proteins)
+            entities_proteins.merge(proteins.rename(columns={"protein_orig_id":"protein_tmp_id"}))
             .merge(entities)
             .merge(microbiomes_entities)[["entity_id", "protein_id", "microbiome_id", "entity_weight"]]
         )
@@ -231,6 +235,5 @@ def main(args=None):
             outfile_microbiomes_entities, sep="\t", header=False, index=False
         )
 
-
 if __name__ == "__main__":
     sys.exit(main())
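This is the sorting half of the fix: protein_id values were previously assigned in input order, which is not stable across runs, so the same protein could receive a different id each time. Sorting on protein_orig_id before renumbering makes the assignment deterministic, and the merge then renames protein_orig_id back to protein_tmp_id because entities_proteins still carries the temporary ids as its join key. A small sketch of the renumbering idiom with made-up data (reset_index(names=...) requires pandas >= 1.5):

import pandas as pd

def renumber(proteins: pd.DataFrame) -> pd.DataFrame:
    # Sort by the stable key, drop the order-dependent ids, then
    # renumber 0..n-1 so ids no longer depend on input order.
    return (
        proteins.sort_values(by="protein_orig_id")
        .drop("protein_id", axis=1)
        .reset_index(drop=True)
        .reset_index(names="protein_id")
    )

run1 = pd.DataFrame({"protein_id": [0, 1, 2], "protein_orig_id": ["c", "a", "b"]})
run2 = run1.iloc[[2, 0, 1]].reset_index(drop=True)  # same rows, different order
run2["protein_id"] = range(len(run2))               # order-dependent ids

print(renumber(run1).equals(renumber(run2)))  # True: identical output either way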
2 changes: 1 addition & 1 deletion bin/unify_model_lengths.py
@@ -68,7 +68,7 @@ def main():
     samplesheet = pd.read_csv(args.input)
 
     # Retrieve unique list of alleles
-    alleles_s = {allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")}
+    alleles_s = sorted({allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")})
     log_str += f"Found the following alleles: {', '.join(alleles_s)}\n\n"
     # Parse alleles to epytope convention
     predictor = EpitopePredictorFactory(args.method)
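This last change removes a subtler nondeterminism: iteration order over a set of strings depends on hash randomization, which Python seeds differently on each interpreter run unless PYTHONHASHSEED is fixed, so the allele order in the log (and anything derived from it) could change between runs. sorted() pins the order; a two-line illustration with hypothetical allele names:

alleles = {"HLA-A*01:01", "HLA-B*07:02", "HLA-A*02:01"}
print(", ".join(alleles))          # order may change from run to run
print(", ".join(sorted(alleles)))  # always: HLA-A*01:01, HLA-A*02:01, HLA-B*07:02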
