Skip to content

Commit

Permalink
extended the mutations search to INDELs
Browse files Browse the repository at this point in the history
  • Loading branch information
jonas-fuchs committed Oct 22, 2023
1 parent 79087c9 commit 523a1b0
Showing 1 changed file with 110 additions and 30 deletions.
140 changes: 110 additions & 30 deletions bamdash/scripts/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
# BUILT INS
import statistics
import math

# LIBS
import pandas as pd
Expand Down Expand Up @@ -280,9 +281,107 @@ def bed_to_dict(bed_file, coverage_df, ref, min_cov):
return define_track_position(bed_dict)


def get_mutations(start, stop, cds, variant, seq):
"""
classifiy the mutation effect on the amino acid level
inspired by SNPeff
:param start: start of the cds
:param stop: stop of the cds
:param cds: cds dict
:param variant: slice of variant df
:param seq: sequence of the ref
:return: amino acid change, mutation effect
"""

as_exchange, as_effect = "NONE", "NONE"

# get the CDS sequence, cds pos of the variant and the codon pos
if "codon_start" in cds:
codon_start = int(cds["codon_start"])
else:
codon_start = 1
# reverse complement depending on the cds direction
if cds["strand"] == "-":
seq_cds = seq[start + codon_start - 2:stop].reverse_complement()
cds_variant_pos = stop - variant["position"]
ref = str(Seq.Seq(variant["reference"]).reverse_complement())
mut = str(Seq.Seq(variant["mutation"]).reverse_complement())
else:
seq_cds = seq[start + codon_start - 2:stop]
cds_variant_pos = variant["position"] - start
ref, mut = variant["mutation"], variant["mutation"]

# define codons -> get the codon with the mutation and the position in the codon
cds_codons = [list(seq_cds[i:i + 3]) for i in range(0, len(seq_cds), 3)]
mut_codon, codon_pos = math.floor(cds_variant_pos / 3), cds_variant_pos % 3 # index affected codon, codon pos
ref_ac = Seq.Seq("".join(cds_codons[mut_codon])).translate() # ref aminoacid

# define the mutation
if variant["type"] == "SNP":
# get mut aminoacid
cds_codons[mut_codon][codon_pos] = mut
alt_ac = Seq.Seq("".join(cds_codons[mut_codon])).translate()
# define mutation type
if ref_ac == alt_ac:
as_effect = "SYN"
elif alt_ac == "*":
as_exchange, as_effect = f"{ref_ac}{mut_codon + 1}{alt_ac}", "STOP_GAINED"
else:
as_exchange, as_effect = f"{ref_ac}{mut_codon + 1}{alt_ac}", "NON_SYN"
elif variant["type"] == "INS":
# 1st case: triplet insertion
if (len(mut) - 1) % 3 == 0:
# mutate the codon
cds_codons[mut_codon][codon_pos] = mut
ins = Seq.Seq("".join(cds_codons[mut_codon][codon_pos])).translate()
# check if the 1st aminoacid has not changed ...
if ins[0] == ref_ac:
as_effect = "CODON_INSERTION"
# ... changed to a stop ...
elif "* " in ins:
as_effect = "STOP_GAINED"
# or has changed
else:
as_effect = "CODON_CHANGE+CODON_INSERTION"
as_exchange = f"{ref_ac}{mut_codon + 1}{ins}"
# 2nd case: non-triplet insertion -> frameshift
else:
as_effect, as_exchange = "FRAMESHIFT", f"{ref_ac}{mut_codon + 1}fsX"
elif variant["type"] == "DEL":
# 1st case: triplet insertion
if (len(ref) - 1) % 3 == 0:
# check if it is the last codon position -> deletion of the next x amninoacids
if codon_pos == 2:
deletion = Seq.Seq(cds_codons[mut_codon] + ref[1:]).translate()
new_ac = ref_ac
# otherwise check which codons are affected and how the new codon looks like
else:
del_codons = cds_codons[mut_codon:int(mut_codon + 1 + (len(ref) - 1) / 3)] # affected codons
deletion = Seq.Seq("".join(sum(del_codons, []))).translate()
# define potential new amino acid by joing the first and last triplet
new_ac = Seq.Seq(
"".join(
del_codons[0][:codon_pos] + del_codons[len(del_codons) - 1][codon_pos + 1:]
)
).translate()
# check if the first aminoacid has changed
if new_ac == ref_ac:
as_effect = "CODON_DELETION"
else:
as_effect = "CODON_CHANGE+CODON_DELETION"
as_exchange = f"{ref_ac}{deletion}{mut_codon + 1}{new_ac}"
# 2nd case: non-triplet insertion
else:
as_effect, as_exchange = "FRAMESHIFT", f"{ref_ac}{mut_codon + 1}fsX"

return as_exchange, as_effect


def annotate_vcf_df(vcf_df, cds_dict, seq):
"""
annotate mutations for their as effect
annotate mutations for their aminoacid effect
:param vcf_df: dataframe with vcf data
:param cds_dict: dictionary with all coding sequences
:param seq: sequence of the reference
Expand All @@ -296,54 +395,35 @@ def annotate_vcf_df(vcf_df, cds_dict, seq):

for variant in vcf_df.iterrows():
proteins_temp, as_exchange_temp, as_effect_temp = [], [], []
pos, mut_type, mut = variant[1]["position"], variant[1]["type"], variant[1]["mutation"]
# check each cds for potential mutations
for cds in cds_dict:
# check if a protein identifier is present
if not any(identifier in potential_cds_identifiers for identifier in cds_dict[cds]):
continue
start, stop = cds_dict[cds]["start"], cds_dict[cds]["stop"]
# check if pos is in range
if pos not in range(start, stop):
if variant[1]["position"] not in range(start, stop+1):
continue
# now we know the protein
for identifier in potential_cds_identifiers:
if identifier in cds_dict[cds]:
proteins_temp.append(cds_dict[cds][identifier])
break
# at the moment only check SNPs
if mut_type != "SNP":
as_exchange_temp.append("AMBIG"), as_effect_temp.append(variant[1]["type"])
continue
# now we can analyse for a potential as exchange
if "codon_start" in cds_dict[cds]:
codon_start = int(cds_dict[cds]["codon_start"])
else:
codon_start = 1
strand, seq_mut = cds_dict[cds]["strand"], str(seq)
# mutate the sequence and get the CDS nt seq
seq_mut = Seq.Seq(seq_mut[:pos-1] + mut + seq_mut[pos:])
seq_cds, seq_mut_cds = seq[start+codon_start-2:stop], seq_mut[start+codon_start-2:stop]
# translate strand depend
if strand == "+":
cds_trans, cds_mut_trans = seq_cds.translate(), seq_mut_cds.translate()
else:
cds_trans, cds_mut_trans = seq_cds.reverse_complement().translate(), seq_mut_cds.reverse_complement().translate()
# get the mutation by searching for a diff between string - prop a bit slow
mut_string = [f"{x}{i+1}{y}" for i, (x, y) in enumerate(zip(cds_trans, cds_mut_trans)) if x != y]
# now append to list
if not mut_string:
as_exchange_temp.append("NONE"), as_effect_temp.append("SYN")
else:
as_exchange_temp.append(mut_string[0]), as_effect_temp.append("NON-SYN")
# annotate mutations
amino_acid_mutations = get_mutations(start, stop, cds_dict[cds], variant[1], seq)
as_exchange_temp.append(amino_acid_mutations[0]), as_effect_temp.append(amino_acid_mutations[1])
# check if we did not find a protein
if not proteins_temp:
proteins.append("NONE"), as_exchange.append("NONE"), as_effect.append("NONE")
# else append all mutations found in all cds
elif len(proteins_temp) == 1:
proteins.append(proteins_temp[0]), as_exchange.append(as_exchange_temp[0]), as_effect.append(as_effect_temp[0])
proteins.append(proteins_temp[0])
as_exchange.append(as_exchange_temp[0])
as_effect.append(as_effect_temp[0])
else:
proteins.append(" | ".join(proteins_temp)), as_exchange.append(" | ".join(as_exchange_temp)), as_effect.append(" | ".join(as_effect_temp))
proteins.append(" | ".join(proteins_temp))
as_exchange.append(" | ".join(as_exchange_temp))
as_effect.append(" | ".join(as_effect_temp))

# annotate and return df
vcf_df["protein"], vcf_df["effect"], vcf_df["as mutation"] = proteins, as_effect, as_exchange
Expand Down

0 comments on commit 523a1b0

Please sign in to comment.