
Commit

Merge branch 'nf-test_assertions' into nf-test_assertions_new
tillenglert committed Aug 5, 2024
2 parents 76ce8da + 3487024 commit 56fdc11
Showing 20 changed files with 60 additions and 50 deletions.
2 changes: 1 addition & 1 deletion bin/check_samplesheet_create_tables.py
@@ -242,7 +242,7 @@ def process_samplesheet(args):
+ "'nextflow run metapep -profile <YOURPROFILE> --outdir <OUTDIR> --show_supported_models"
)

-alleles = pd.DataFrame({"allele_name": list(unique_alleles)})
+alleles = pd.DataFrame({"allele_name": sorted(list(unique_alleles))})
alleles["allele_id"] = range(len(alleles))
alleles[["allele_id", "allele_name"]].to_csv(args.alleles, sep="\t", index=False)

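The motivation for the sorted() call above: iteration order over a Python set of strings can differ between runs (Python randomizes string hashing per process), so allele_id values assigned by position would not be reproducible otherwise. A minimal standalone sketch of the effect, with made-up allele names:

import pandas as pd

unique_alleles = {"HLA-A*01:01", "HLA-B*07:02", "HLA-C*04:01"}  # hypothetical input

# Sorting fixes the row order, so the positional allele_id assignment is stable across runs
alleles = pd.DataFrame({"allele_name": sorted(unique_alleles)})
alleles["allele_id"] = range(len(alleles))
print(alleles[["allele_id", "allele_name"]].to_csv(sep="\t", index=False))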
58 changes: 32 additions & 26 deletions bin/concat_tsv.py
@@ -2,6 +2,8 @@

import argparse
import sys
+from gzip import GzipFile
+from io import TextIOWrapper

import pandas as pd

@@ -28,35 +30,39 @@ def parse_args(args=None):
return parser.parse_args()


+# Peptide Predictions remain unsorted; otherwise the full dataframe would need to be loaded into memory, which would be computationally very intensive.
+
def main(args=None):
    args = parse_args(args)

-    first_header = pd.DataFrame().columns
-    for i, filename in enumerate(args.input):
-        print("Processing file: ", filename, flush=True)
-
-        # Read input file chunk-wise
-        with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
-            for j, tsv_chunk in enumerate(reader):
-                print(" Chunk: ", j, flush=True)
-                if i == 0 and j == 0:
-                    first_header = tsv_chunk.columns
-                    print("Header: ", first_header.tolist(), flush=True)
-                    tsv_chunk.to_csv(args.output, mode="w", sep="\t", index=False, header=True)
-                else:
-                    if j == 0:
-                        # Check if header of subsequent input files match header of first input file
-                        # (column order must be the same)
-                        if tsv_chunk.columns.tolist() != first_header.tolist():
-                            print(
-                                "ERROR - header of input file",
-                                filename,
-                                "does not match the header of the first input file!",
-                                file=sys.stderr,
-                            )
-                            sys.exit(1)
-
-                    tsv_chunk.to_csv(args.output, mode="a", sep="\t", index=False, header=False)
+    with TextIOWrapper(GzipFile(args.output, 'w', mtime=0), encoding='utf-8') as outfile:
+
+        first_header = pd.DataFrame().columns
+        for i, filename in enumerate(args.input):
+            print("Processing file: ", filename, flush=True)
+
+            # Read input file chunk-wise
+            with pd.read_csv(filename, sep="\t", chunksize=args.chunk_size) as reader:
+                for j, tsv_chunk in enumerate(reader):
+                    print(" Chunk: ", j, flush=True)
+                    if i == 0 and j == 0:
+                        first_header = tsv_chunk.columns
+                        print("Header: ", first_header.tolist(), flush=True)
+                        tsv_chunk.to_csv(outfile, mode="w", sep="\t", index=False, header=True)
+                    else:
+                        if j == 0:
+                            # Check if header of subsequent input files match header of first input file
+                            # (column order must be the same)
+                            if tsv_chunk.columns.tolist() != first_header.tolist():
+                                print(
+                                    "ERROR - header of input file",
+                                    filename,
+                                    "does not match the header of the first input file!",
+                                    file=sys.stderr,
+                                )
+                                sys.exit(1)
+
+                        tsv_chunk.to_csv(outfile, mode="a", sep="\t", index=False, header=False)


if __name__ == "__main__":
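The reason for the GzipFile(..., mtime=0) wrapper introduced above: the gzip header embeds a modification timestamp, so two identical payloads compressed at different times still yield different bytes and break checksum-based snapshot assertions. A minimal in-memory sketch (not pipeline code) of that effect:

from gzip import GzipFile
from io import BytesIO, TextIOWrapper

def write_gz(mtime):
    buf = BytesIO()
    with TextIOWrapper(GzipFile(fileobj=buf, mode="w", mtime=mtime), encoding="utf-8") as fh:
        fh.write("peptide_id\tprediction_score\n")  # hypothetical content
    return buf.getvalue()

# With a fixed mtime the compressed output is byte-identical across runs
assert write_gz(0) == write_gz(0)
# With the default mtime (current time) the gzip header, and hence any checksum, can change between runs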
8 changes: 4 additions & 4 deletions bin/download_proteins_entrez.py
@@ -142,12 +142,12 @@ def main(args=None):
+ "It needs to be a tsv file containing 'taxon_id' and/or optionally 'assembly_id' and 'abundance."
)

-taxIds = list(set(taxIds))
+taxIds = sorted(list(set(taxIds)))
####################################################################################################
# Process TaxIDs

print("Processing the following taxonomy IDs:")
-print(taxIds)
+print(sorted(taxIds))

####################################################################################################
# 0) Check if the taxids link to a strain level organism
@@ -307,7 +307,7 @@ def main(args=None):
# some proteins, such as 487413233, occur within multiple sequences of the assembly!
# -> assembly only listed once!

-proteinIds = list(dict_proteinId_assemblyIds.keys())
+proteinIds = sorted(list(dict_proteinId_assemblyIds.keys()))

print("# proteins (unique): ", len(proteinIds))
# -> # proteins with refseq source (<= # IPG proteins)
@@ -394,7 +394,7 @@ def main(args=None):
for proteinId in proteinIds:
accVersion = dict_protein_uid_acc[proteinId]
# write out protein_tmp_id, entity_name (taxon_id)
-for assemblyId in dict_proteinId_assemblyIds[proteinId]:
+for assemblyId in sorted(dict_proteinId_assemblyIds[proteinId]):
taxId = dict_assemblyId_taxId[assemblyId]
print(accVersion, taxId, sep="\t", file=args.entities_proteins, flush=True)

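The sorts in download_proteins_entrez.py serve the same reproducibility goal: taxIds is derived from a set, and dict_proteinId_assemblyIds keeps whatever insertion order the Entrez queries happened to produce, so both are sorted before they drive any output. A small sketch with made-up IDs:

dict_proteinId_assemblyIds = {"487413233": ["GCF_000005845.2"], "16128551": ["GCF_000008865.2"]}  # hypothetical
proteinIds = sorted(dict_proteinId_assemblyIds.keys())
print(proteinIds)  # ['16128551', '487413233'] regardless of insertion order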
8 changes: 6 additions & 2 deletions bin/fasta_to_tsv.py
@@ -14,9 +14,13 @@
)
args = parser.parse_args()

+records_out = []
with gzip.open(args.input, "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        if args.remove_asterisk and record.seq[-1] == "*":
-            print(f"{record.id}\t{record.seq[:-1]}")
+            records_out.append([str(record.id),"\t",str(record.seq[:-1]),"\n"])
        else:
-            print(f"{record.id}\t{record.seq}")
+            records_out.append([str(record.id),"\t",str(record.seq),"\n"])
+# Two dimensional array to enable sorting
+records_out = sorted(records_out, key=lambda x: x[0])
+print("".join(["".join(rec) for rec in records_out]))
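Collecting the records and sorting them by record ID makes the emitted TSV independent of the record order in the input FASTA. Roughly, assuming --remove_asterisk is set (records are made up):

records = [("contig_2_1", "MKVL*"), ("contig_1_1", "MAAT*")]  # hypothetical (id, sequence) pairs

records_out = []
for rec_id, seq in records:
    records_out.append([rec_id, "\t", seq[:-1], "\n"])  # strip the trailing '*'
records_out = sorted(records_out, key=lambda x: x[0])
print("".join(["".join(rec) for rec in records_out]))  # contig_1_1 always precedes contig_2_1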
2 changes: 1 addition & 1 deletion bin/generate_peptides.py
@@ -159,7 +159,7 @@ def main(args=None):
results["count"] = pd.to_numeric(results["count"], downcast="unsigned")
# prepare df for joining
results.set_index("peptide_sequence", inplace=True)
-results.sort_index(inplace=True)
+results.sort_index(inplace=True, kind="stable")

unique_peptides = pd.DataFrame(index=results.index.drop_duplicates())
unique_peptides["peptide_id"] = range(id_counter, id_counter + len(unique_peptides))
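The kind="stable" argument matters because the default quicksort is not stable: rows sharing the same peptide_sequence could end up in a different relative order from run to run, shifting the subsequent peptide_id assignment. A small sketch with made-up data:

import pandas as pd

results = pd.DataFrame(
    {"protein_id": [7, 3, 5], "count": [1, 2, 1]},
    index=pd.Index(["AAAAKLML", "AAAAKLML", "CILKMWWA"], name="peptide_sequence"),
)
results.sort_index(inplace=True, kind="stable")
print(results)  # the two AAAAKLML rows keep their original relative order (protein_id 7 before 3)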
2 changes: 1 addition & 1 deletion bin/generate_protein_and_entity_ids.py
@@ -150,7 +150,7 @@ def main(args=None):
# (i.e. microbiome_bare_id) and assign the corresponding microbiome_id to the entities
for microbiome_id in microbiomes.groupby("microbiome_bare_id").get_group(microbiome_bare_id)[
"microbiome_id"
-]:
+].sort_values():
entities = pd.DataFrame()
entities = proteins[["entity_name"]].drop_duplicates()
entities["entity_id"] = range(next_entity_id, next_entity_id + len(entities))
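The .sort_values() call fixes the order in which the microbiome_ids of one microbiome_bare_id group are visited, so the entity_id ranges assigned inside the loop come out in a reproducible sequence. Sketch with made-up IDs:

import pandas as pd

microbiomes = pd.DataFrame({"microbiome_bare_id": [0, 0, 1], "microbiome_id": [2, 1, 3]})  # hypothetical
for microbiome_id in microbiomes.groupby("microbiome_bare_id").get_group(0)["microbiome_id"].sort_values():
    print(microbiome_id)  # 1, then 2, independent of the input row order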
2 changes: 1 addition & 1 deletion bin/unify_model_lengths.py
@@ -68,7 +68,7 @@ def main():
samplesheet = pd.read_csv(args.input)

# Retrieve unique list of alleles
-alleles_s = {allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")}
+alleles_s = sorted({allele for allele_list in samplesheet["alleles"] for allele in allele_list.split(" ")})
log_str += f"Found the following alleles: {', '.join(alleles_s)}\n\n"
# Parse alleles to epytope convention
predictor = EpitopePredictorFactory(args.method)
2 changes: 1 addition & 1 deletion tests/pipeline/test.nf.test.snap
@@ -33,4 +33,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_all.nf.test.snap
@@ -44,4 +44,4 @@
},
"timestamp": "2024-07-17T15:58:41.467969153"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_assembly_only.nf.test.snap
@@ -33,4 +33,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_bins_only.nf.test.snap
@@ -33,4 +33,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_coassembly.nf.test.snap
@@ -33,4 +33,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mhcflurry.nf.test.snap
@@ -32,4 +32,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mhcnuggets_1.nf.test.snap
@@ -32,4 +32,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mhcnuggets_2.nf.test.snap
@@ -32,4 +32,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mouse.nf.test.snap
@@ -44,4 +44,4 @@
},
"timestamp": "2024-07-18T05:18:07.866546467"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_mouse_all_pep_lengths.nf.test.snap
@@ -32,4 +32,4 @@
},
"timestamp": "2024-06-26T08:14:54+0000"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_taxa_only.nf.test.snap
@@ -44,4 +44,4 @@
},
"timestamp": "2024-07-18T14:49:36.871618584"
}
-}
+}
2 changes: 1 addition & 1 deletion tests/pipeline/test_taxa_specific_assembly.nf.test.snap
@@ -44,4 +44,4 @@
},
"timestamp": "2024-07-17T16:08:40.670649421"
}
-}
+}
4 changes: 2 additions & 2 deletions workflows/metapep.nf
@@ -216,8 +216,8 @@ workflow METAPEP {
ch_merge_predictions_input_warn = MERGE_PREDICTIONS_BUFFER.out.ch_prediction_warnings_merged_buffer.mix(ch_predictions_unbuffered.warnings)

MERGE_PREDICTIONS (
-ch_merge_predictions_input_pred.collect(),
-ch_merge_predictions_input_warn.collect()
+ch_merge_predictions_input_pred.collect(sort: { it.baseName }),
+ch_merge_predictions_input_warn.collect(sort: { it.baseName })
)
ch_versions = ch_versions.mix(MERGE_PREDICTIONS.out.versions)

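The same idea applies on the workflow side: collect() gathers channel items in task completion order, which can vary between runs, so both collections are now sorted by file baseName before being passed to MERGE_PREDICTIONS. The effect is analogous to this Python sketch (file names are made up; the actual change is the Nextflow collect(sort: ...) shown above):

from pathlib import Path

# hypothetical per-chunk prediction files arriving in nondeterministic order
files = [Path("predictions_2.tsv.gz"), Path("predictions_0.tsv.gz"), Path("predictions_1.tsv.gz")]
ordered = sorted(files, key=lambda p: p.name)  # like collect(sort: { it.baseName })
print([p.name for p in ordered])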
