Merge branch 'testing' into nf-test_assertions
tillenglert committed Aug 5, 2024
2 parents b002195 + dc0969a commit 3487024
Showing 31 changed files with 192 additions and 743 deletions.
6 changes: 3 additions & 3 deletions bin/download_proteins_entrez.py
@@ -142,7 +142,7 @@ def main(args=None):
+ "It needs to be a tsv file containing 'taxon_id' and/or optionally 'assembly_id' and 'abundance."
)

taxIds = list(set(taxIds))
taxIds = sorted(list(set(taxIds)))
####################################################################################################
# Process TaxIDs

@@ -307,7 +307,7 @@ def main(args=None):
# some proteins, such as 487413233, occur within multiple sequences of the assembly!
# -> assembly only listed once!

proteinIds = list(dict_proteinId_assemblyIds.keys())
proteinIds = sorted(list(dict_proteinId_assemblyIds.keys()))

print("# proteins (unique): ", len(proteinIds))
# -> # proteins with refseq source (<= # IPG proteins)
@@ -394,7 +394,7 @@ def main(args=None):
for proteinId in proteinIds:
accVersion = dict_protein_uid_acc[proteinId]
# write out protein_tmp_id, entity_name (taxon_id)
for assemblyId in dict_proteinId_assemblyIds[proteinId]:
for assemblyId in sorted(dict_proteinId_assemblyIds[proteinId]):
taxId = dict_assemblyId_taxId[assemblyId]
print(accVersion, taxId, sep="\t", file=args.entities_proteins, flush=True)

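The three hunks above make the same kind of change: iteration over a set or over dict keys is wrapped in sorted(), so the script emits its rows in a reproducible order. Python gives no ordering guarantee for set iteration (string hashing is even randomized per interpreter run), so unsorted iteration can reshuffle the output between runs, which breaks byte-wise test assertions. A minimal sketch of the effect, using made-up taxon IDs rather than the script's real Entrez data:

# Illustrative only: why the sorted() wrappers above make the output stable.
tax_ids = ["562", "1280", "562", "4932"]

unordered = list(set(tax_ids))        # order may differ from run to run
deterministic = sorted(set(tax_ids))  # always ['1280', '4932', '562']

print(unordered)
print(deterministic)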
8 changes: 6 additions & 2 deletions bin/fasta_to_tsv.py
@@ -14,9 +14,13 @@
)
args = parser.parse_args()

records_out = []
with gzip.open(args.input, "rt") as handle:
for record in SeqIO.parse(handle, "fasta"):
if args.remove_asterisk and record.seq[-1] == "*":
print(f"{record.id}\t{record.seq[:-1]}")
records_out.append([str(record.id),"\t",str(record.seq[:-1]),"\n"])
else:
print(f"{record.id}\t{record.seq}")
records_out.append([str(record.id),"\t",str(record.seq),"\n"])
# Two dimensional array to enable sorting
records_out = sorted(records_out, key=lambda x: x[0])
print("".join(["".join(rec) for rec in records_out]))
21 changes: 11 additions & 10 deletions bin/generate_peptides.py
@@ -1,8 +1,7 @@
#!/usr/bin/env python3

import argparse
from gzip import GzipFile
from io import TextIOWrapper
import gzip
import sys

import pandas as pd
@@ -127,7 +126,7 @@ def main(args=None):

####################
# generate peptides
with TextIOWrapper(GzipFile(args.peptides, 'w', mtime=0), encoding='utf-8') as pep_handle:
with gzip.open(args.peptides, "wt") as pep_handle:
print_header = True
id_counter = 0

@@ -158,21 +157,23 @@ def main(args=None):
results = results.groupby(["protein_id", "peptide_sequence"]).size().reset_index(name="count")
# -> protein_id, peptide_sequence, count
results["count"] = pd.to_numeric(results["count"], downcast="unsigned")
# prepare df for joining
results.set_index("peptide_sequence", inplace=True)
results.sort_index(inplace=True, kind="stable")

results = results.sort_values(by="peptide_sequence")
pep_ids = results.groupby("peptide_sequence").ngroup()
results["peptide_id"] = pep_ids + id_counter
id_counter = id_counter + len(pep_ids)

unique_peptides = pd.DataFrame(index=results.index.drop_duplicates())
unique_peptides["peptide_id"] = range(id_counter, id_counter + len(unique_peptides))
id_counter += len(unique_peptides)
# -> peptide_sequence, peptide_id
results[["peptide_id", "peptide_sequence"]].drop_duplicates().sort_values(by=["peptide_sequence","peptide_id"]).to_csv(pep_handle, mode="a", sep="\t", index=False, header=print_header)
unique_peptides.to_csv(pep_handle, mode="a", sep="\t", index=True, header=print_header)

results = results.join(unique_peptides)
# -> protein_id, peptide_sequence, count, peptide_id

print("\nInfo: results (['protein_id','peptide_sequence','peptide_id','count'])", flush=True)
results.info(verbose=False, memory_usage=print_mem)

results[["protein_id", "peptide_id", "count"]].drop_duplicates().sort_values(by=["protein_id", "peptide_id"]).to_csv(
results[["protein_id", "peptide_id", "count"]].to_csv(
args.proteins_peptides, mode="a", sep="\t", index=False, header=print_header
)

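Two things change here: the GzipFile/TextIOWrapper imports give way to a plain import gzip with the peptide output opened via gzip.open(args.peptides, "wt"), and peptide IDs are now assigned by indexing results on the peptide sequence, sorting that index, numbering the distinct sequences, and joining the IDs back onto the per-protein rows. A hedged sketch of that ID-assignment pattern (the column names follow the diff, the toy values are invented):

import pandas as pd

results = pd.DataFrame({
    "protein_id": [0, 0, 1, 1],
    "peptide_sequence": ["GHIKL", "ACDEF", "ACDEF", "MNPQR"],
    "count": [2, 1, 1, 1],
})

id_counter = 0
results.set_index("peptide_sequence", inplace=True)
results.sort_index(inplace=True, kind="stable")

# One row per distinct peptide; IDs are handed out in sorted-sequence order
unique_peptides = pd.DataFrame(index=results.index.drop_duplicates())
unique_peptides["peptide_id"] = range(id_counter, id_counter + len(unique_peptides))
id_counter += len(unique_peptides)

results = results.join(unique_peptides)  # every protein/peptide row gets its peptide_id
print(results.reset_index())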
114 changes: 67 additions & 47 deletions bin/generate_protein_and_entity_ids.py
@@ -6,8 +6,7 @@
# proteins from 'proteins' input type: not known if unique or not, handle separately for now (in case of unique ids this causes unnecessary redundancy; could add parameter for this in future)

import argparse
from gzip import GzipFile
from io import TextIOWrapper
import gzip
import sys

import pandas as pd
@@ -86,14 +85,24 @@ def parse_args(args=None):
def main(args=None):
args = parse_args(args)

next_protein_id = 0
next_entity_id = 0

proteins_columns = ["protein_id", "protein_orig_id", "protein_sequence"]
entities_proteins_columns = ["entity_id", "protein_id"]
entities_columns = ["entity_id", "entity_name"]
microbiomes_entities_columns = ["microbiome_id", "entity_id"]
with TextIOWrapper(GzipFile(args.out_proteins, 'w', mtime=0), encoding='utf-8') as outfile_proteins:
with gzip.open(args.out_proteins, "wt") as outfile_proteins, open(
args.out_entities_proteins, "w"
) as outfile_entities_proteins, open(args.out_entities, "w") as outfile_entities, open(
args.out_microbiomes_entities, "w"
) as outfile_microbiomes_entities:
# HEADERS
print("\t".join(proteins_columns), file=outfile_proteins)
print("\t".join(entities_proteins_columns), file=outfile_entities_proteins)
print("\t".join(entities_columns), file=outfile_entities)
print("\t".join(microbiomes_entities_columns), file=outfile_microbiomes_entities)

entities_dfs = []
proteins_dfs = []
#
# PREDICTED PROTEINS
#
@@ -118,7 +127,6 @@ def main(args=None):
# Read all provided files while checking in each microbiome_bare_id
# Bins contain multiple files within one filepath (gzipped) corresponding to one microbiome_bare_id
check_in_microbiome_bare_id = set()

for microbiome_bare_id, bin_basename, inpath in zip(
args.predicted_proteins_microbiome_ids, args.predicted_proteins_bin_basenames, args.predicted_proteins
):
@@ -132,31 +140,49 @@
else:
proteins["entity_name"] = bin_basename

proteins["protein_id"] = range(next_protein_id, next_protein_id + len(proteins))
next_protein_id += len(proteins)

# Check if microbiome is coassembly
if len(microbiomes.groupby("microbiome_bare_id").get_group(microbiome_bare_id)) != 1:
all_entities = []
# Iterate over microbiome_ids associated to current co-assembly
# (i.e. microbiome_bare_id) and assign the corresponding microbiome_id to the entities
for microbiome_id in microbiomes.groupby("microbiome_bare_id").get_group(microbiome_bare_id)[
"microbiome_id"
]:
].sort_values():
entities = pd.DataFrame()
entities = proteins[["entity_name"]].drop_duplicates()

entities["entity_id"] = range(next_entity_id, next_entity_id + len(entities))
# Instead of microbiome_bare_id append microbiome_id
entities["microbiome_id"] = microbiome_id
all_entities.append(entities)

next_entity_id += len(entities)
entities = pd.concat(all_entities)

else:
entities = proteins[["entity_name"]].drop_duplicates()
entities["entity_id"] = range(next_entity_id, next_entity_id + len(entities))
next_entity_id += len(entities)
# If not coassembled microbiome_id = microbiome_bare_id
entities["microbiome_id"] = microbiome_bare_id

# Merge all dfs of all microbiomes
proteins_dfs.append(proteins)
entities_dfs.append(entities)
# Write proteins
proteins.rename(columns={"protein_tmp_id": "protein_orig_id"}, inplace=True)
proteins[proteins_columns].to_csv(outfile_proteins, sep="\t", header=False, index=False)
# Write entities_proteins
proteins.merge(entities)[["entity_id", "protein_id"]].drop_duplicates().to_csv(
outfile_entities_proteins, sep="\t", header=False, index=False
)
# Write entities
entities[entities_columns].drop_duplicates().to_csv(
outfile_entities, sep="\t", header=False, index=False
)
# Write microbiomes - entities
entities[microbiomes_entities_columns].to_csv(
outfile_microbiomes_entities, sep="\t", index=False, header=False
)

#
# ENTREZ PROTEINS
@@ -174,43 +200,37 @@
args.entrez_microbiomes_entities, "\t"
) # entity_name, microbiome_id, entity_weight

# Collect Entities
entities = microbiomes_entities[["entity_name", "microbiome_id"]].drop_duplicates()
proteins = proteins.merge(entities_proteins)

entities_dfs.append(entities)
proteins_dfs.append(proteins)

proteins = pd.concat(proteins_dfs, ignore_index=True)
entities = pd.concat(entities_dfs, ignore_index=True)

# Collect Entities and sort them
entities["entity_name"] = entities["entity_name"].astype(str) # Taxids are read as numeric and cannot be compared to bins/assembly ids
entities = entities.sort_values(by="entity_name")
entities["entity_id"] = entities.groupby("entity_name").ngroup()

# Collect Proteins and sort them
proteins.rename(columns={"protein_tmp_id": "protein_orig_id"}, inplace=True)
proteins["entity_name"] = proteins["entity_name"].astype(str) # Taxids are read as numeric and cannot be compared to bins/assembly ids
proteins = proteins.sort_values(by="protein_orig_id")
proteins["protein_id"] = proteins.groupby("protein_orig_id").ngroup()


# Write Proteins
proteins[proteins_columns].drop_duplicates().sort_values(by="protein_id").to_csv(outfile_proteins, sep="\t", index=False)

# Write Entities-Proteins
proteins.merge(entities)[entities_proteins_columns].drop_duplicates().sort_values(by=entities_proteins_columns).to_csv(
args.out_entities_proteins, sep="\t", index=False
)

# Write entities
entities[entities_columns].sort_values(by=entities_columns).drop_duplicates().to_csv(args.out_entities, sep="\t", index=False)
# Assign protein_id
proteins["protein_id"] = range(next_protein_id, next_protein_id + len(proteins))
next_protein_id += len(proteins)

# Assign entity_id
entities = microbiomes_entities[["entity_name"]].drop_duplicates()
entities["entity_id"] = range(next_entity_id, next_entity_id + len(entities))
next_entity_id += len(entities)

# Write proteins
proteins.rename(columns={"protein_tmp_id": "protein_orig_id"})[proteins_columns].to_csv(
outfile_proteins, sep="\t", header=False, index=False
)

entities_microbiomes_proteins = (
entities_proteins.merge(proteins)
.merge(entities)
.merge(microbiomes_entities)[["entity_id", "protein_id", "microbiome_id", "entity_weight"]]
)

# Write entities_proteins: 'entity_id', 'protein_id'
entities_microbiomes_proteins[entities_proteins_columns].to_csv(
outfile_entities_proteins, sep="\t", header=False, index=False
)
# Write entities: 'entity_id', 'entity_name'
entities[entities_columns].to_csv(outfile_entities, sep="\t", header=False, index=False)
# Write microbiomes - entities: 'microbiome_id', 'entity_id'
entities_microbiomes_proteins[microbiomes_entities_columns].drop_duplicates().to_csv(
outfile_microbiomes_entities, sep="\t", header=False, index=False
)

# Write microbiomes - entities
entities[microbiomes_entities_columns].sort_values(by=microbiomes_entities_columns).to_csv(
args.out_microbiomes_entities, sep="\t", index=False
)

if __name__ == "__main__":
sys.exit(main())
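This script gets the largest rewrite. The old version collected per-microbiome DataFrames, concatenated them at the end, and derived IDs with groupby().ngroup() before writing the four output tables; the new version opens all four output files up front, prints the headers once, and streams each chunk out with running next_protein_id / next_entity_id counters. A hedged sketch of that incremental-ID, streaming-write pattern (the file name and toy frames are invented; only the pattern follows the diff):

import gzip
import pandas as pd

proteins_columns = ["protein_id", "protein_orig_id", "protein_sequence"]
next_protein_id = 0

# Stand-ins for the per-microbiome protein tables the real script builds
batches = [
    pd.DataFrame({"protein_orig_id": ["a", "b"], "protein_sequence": ["MK", "MS"]}),
    pd.DataFrame({"protein_orig_id": ["c"], "protein_sequence": ["MG"]}),
]

with gzip.open("proteins.tsv.gz", "wt") as outfile_proteins:
    print("\t".join(proteins_columns), file=outfile_proteins)  # header written once
    for proteins in batches:
        proteins["protein_id"] = range(next_protein_id, next_protein_id + len(proteins))
        next_protein_id += len(proteins)  # keeps IDs unique across all batches
        proteins[proteins_columns].to_csv(outfile_proteins, sep="\t", header=False, index=False)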
3 changes: 0 additions & 3 deletions testdata/input.assembly_only.csv

This file was deleted.

3 changes: 0 additions & 3 deletions testdata/input.bins.csv

This file was deleted.

5 changes: 0 additions & 5 deletions testdata/input.csv

This file was deleted.

5 changes: 0 additions & 5 deletions testdata/input.mouse.csv

This file was deleted.

4 changes: 0 additions & 4 deletions testdata/input.tiny.csv

This file was deleted.

4 changes: 0 additions & 4 deletions testdata/input.tmp.csv

This file was deleted.

3 changes: 0 additions & 3 deletions testdata/taxids.tiny.txt

This file was deleted.

3 changes: 0 additions & 3 deletions testdata/taxids.txt

This file was deleted.

Binary file removed testdata/test_bins.tar.gz