-
Notifications
You must be signed in to change notification settings - Fork 9
/
ground_truth_mapper.py
73 lines (56 loc) · 1.92 KB
/
ground_truth_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""Methods to convert ground truth labels to the common data format."""
import re
# Direct token-to-token substitutions applied by `format_sequence` before any
# regex-based rewriting. Each entry is unpacked into `str.replace`, i.e. it is
# an `(old, new)` or `(old, new, count)` argument tuple. Empty here: no direct
# replacements are performed.
REPLACEMENTS = []
# One uppercase amino acid letter followed by a bracketed numeric modification,
# e.g. `M[15.995]`. Group 1 is the residue letter, group 2 the bracket
# contents (digits, `.`, `+` and `-` only).
PTM_PATTERN = r"([A-Z])\[([0-9.+-]+)\]" # find AAs with PTMs
# A lowercase `n` at the very start of the sequence followed by a bracketed
# numeric modification, e.g. `n[42.011]`. Group 1 is the bracket contents.
N_TERM_MOD_PATTERN = r"^n\[([0-9.+-]+)\]" # find N-term modifications
def _transform_match_ptm(match: re.Match) -> str:
    """
    Transform representation of amino acids substring matching
    the PTM pattern.

    Expects a match of `PTM_PATTERN`: an amino acid letter followed by
    a numeric mass shift in square brackets, e.g. 'M[15.995]',
    'M[+15.995]' or 'N[-17.027]'.
    The mass shift is returned with an explicit sign (delta mass
    notation): unsigned values get a leading '+', values that already
    carry a sign are kept unchanged.

    Parameters
    ----------
    match : re.Match
        Substring matching the PTM pattern. Group 1 must hold the amino
        acid letter and group 2 the bracket contents.

    Returns
    -------
    transformed_match : str
        Transformed PTM pattern representation, e.g. 'M[+15.995]'.
    """
    aa, ptm = match.group(1), match.group(2)
    # Only add a sign when none is present: the pattern also accepts values
    # that are already signed, and blindly prepending '+' would produce an
    # invalid double sign such as '[++15.995]'.
    if not ptm.startswith(("+", "-")):
        ptm = "+" + ptm
    return f"{aa}[{ptm}]"
def _transform_match_n_term_mod(match: re.Match) -> str:
    """
    Transform representation of peptide substring matching
    the N-term modification pattern.

    `n[+n_mod]PEP` -> `[+n_mod]-PEP`

    Only the matched prefix (`n[...]`) is rewritten; the remainder of
    the peptide is left in place by `re.sub`. The mass shift is returned
    with an explicit sign: unsigned values get a leading '+', values
    that already carry a sign are kept unchanged.

    Parameters
    ----------
    match : re.Match
        Substring matching the N-term modification pattern. Group 1 must
        hold the bracket contents (the numeric mass shift).

    Returns
    -------
    transformed_match : str
        N-term modification in delta mass notation followed by the
        terminal separator, e.g. '[+42.011]-'.
    """
    ptm = match.group(1)
    # Only add a sign when none is present: the pattern also accepts values
    # that are already signed, and blindly prepending '+' would produce an
    # invalid double sign such as '[++42.011]-'.
    if not ptm.startswith(("+", "-")):
        ptm = "+" + ptm
    return f"[{ptm}]-"
def format_sequence(sequence: str) -> str:
    """
    Rewrite a ground truth peptide sequence into the common output format.

    Three passes are applied in order:

    1. every entry of `REPLACEMENTS` is applied via `str.replace`;
    2. residue modifications matching `PTM_PATTERN` are rewritten to
       delta mass notation, `PEM[15.995]` -> `PEM[+15.995]`;
    3. an N-terminal modification matching `N_TERM_MOD_PATTERN` is
       rewritten, `n[42.011]PEP` -> `[+42.011]-PEP`.

    Parameters
    ----------
    sequence : str
        Peptide sequence in the original ground truth format.

    Returns
    -------
    transformed_sequence : str
        Peptide sequence in the common output data format.
    """
    formatted = sequence

    # Pass 1: plain token-to-token substitutions (none when the list is empty).
    for replace_args in REPLACEMENTS:
        formatted = formatted.replace(*replace_args)

    # Passes 2 and 3: regex rewrites, residue PTMs first, then the N-terminus.
    regex_steps = (
        (PTM_PATTERN, _transform_match_ptm),
        (N_TERM_MOD_PATTERN, _transform_match_n_term_mod),
    )
    for pattern, transformer in regex_steps:
        formatted = re.sub(pattern, transformer, formatted)

    return formatted