From a0518f8f75589c150948ff34bd13b477b25b33b3 Mon Sep 17 00:00:00 2001
From: zethson <lukas.heumos@posteo.net>
Date: Tue, 19 Nov 2019 21:49:00 +0100
Subject: [PATCH] [FIX] various allele naming hiccups

---
 Fred2/EpitopePrediction/ANN.py | 158 ++++++++++++++++++---------------
 1 file changed, 85 insertions(+), 73 deletions(-)

diff --git a/Fred2/EpitopePrediction/ANN.py b/Fred2/EpitopePrediction/ANN.py
index f6936408..f59aa932 100644
--- a/Fred2/EpitopePrediction/ANN.py
+++ b/Fred2/EpitopePrediction/ANN.py
@@ -80,24 +80,24 @@ class MHCNuggetsPredictor_1(AANNEpitopePrediction):
         """
         __metaclass__ = SignatureCheckerMeta
         __alleles = frozenset(
-            ["HLA-A01:01", "HLA-A02:01", "HLA-A02:02", "HLA-A02:03", "HLA-A02:04", "HLA-A02:05", "HLA-A02:06",
-             "HLA-A02:07", "HLA-A02:08", "HLA-A02:09", "HLA-A02:10", "HLA-A02:11", "HLA-A02:12", "HLA-A02:14",
-             "HLA-A02:16", "HLA-A02:17", "HLA-A02:19", "HLA-A02:50", "HLA-A03:01", "HLA-A03:02", "HLA-A03:19",
-             "HLA-A11:01", "HLA-A11:02", "HLA-A23:01", "HLA-A24:01", "HLA-A24:02", "HLA-A24:03", "HLA-A25:01",
-             "HLA-A26:01", "HLA-A26:02", "HLA-A26:03", "HLA-A29:01", "HLA-A29:02", "HLA-A30:01", "HLA-A30:02",
-             "HLA-A30:03", "HLA-A30:04", "HLA-A31:01", "HLA-A32:01", "HLA-A32:07", "HLA-A32:15", "HLA-A33:01",
-             "HLA-A33:03", "HLA-A66:01", "HLA-A68:01", "HLA-A68:02", "HLA-A68:23", "HLA-A69:01", "HLA-A74:01",
-             "HLA-A80:01", "HLA-B07:01", "HLA-B07:02", "HLA-B08:01", "HLA-B08:02", "HLA-B08:03", "HLA-B12:01",
-             "HLA-B13:02", "HLA-B14:01", "HLA-B14:02", "HLA-B15:01", "HLA-B15:02", "HLA-B15:03", "HLA-B15:08",
-             "HLA-B15:09", "HLA-B15:10", "HLA-B15:13", "HLA-B15:16", "HLA-B15:17", "HLA-B15:42", "HLA-B18:01",
-             "HLA-B27:01", "HLA-B27:02", "HLA-B27:03", "HLA-B27:04", "HLA-B27:05", "HLA-B27:06", "HLA-B27:09",
-             "HLA-B27:10", "HLA-B27:20", "HLA-B35:01", "HLA-B35:02", "HLA-B35:03", "HLA-B35:08", "HLA-B37:01",
-             "HLA-B38:01", "HLA-B39:01", "HLA-B39:06", "HLA-B39:09", "HLA-B39:10", "HLA-B40:01", "HLA-B40:02",
-             "HLA-B40:13", "HLA-B41:03", "HLA-B41:04", "HLA-B42:01", "HLA-B42:02", "HLA-B44:01", "HLA-B44:02",
-             "HLA-B44:03", "HLA-B44:05", "HLA-B45:01", "HLA-B45:06", "HLA-B46:01", "HLA-B48:01", "HLA-B51:01",
-             "HLA-B51:02", "HLA-B52:01", "HLA-B53:01", "HLA-B54:01", "HLA-B55:01", "HLA-B55:02", "HLA-B56:01",
-             "HLA-B57:01", "HLA-B57:02", "HLA-B57:03", "HLA-B58:01", "HLA-B58:02", "HLA-B60:01", "HLA-B61:01",
-             "HLA-B62:01", "HLA-B73:01", "HLA-B81:01", "HLA-B83:01"])
+            ["HLA-A*01:01", "HLA-A*02:01", "HLA-A*02:02", "HLA-A*02:03", "HLA-A*02:04", "HLA-A*02:05", "HLA-A*02:06",
+             "HLA-A*02:07", "HLA-A*02:08", "HLA-A*02:09", "HLA-A*02:10", "HLA-A*02:11", "HLA-A*02:12", "HLA-A*02:14",
+             "HLA-A*02:16", "HLA-A*02:17", "HLA-A*02:19", "HLA-A*02:50", "HLA-A*03:01", "HLA-A*03:02", "HLA-A*03:19",
+             "HLA-A*11:01", "HLA-A*11:02", "HLA-A*23:01", "HLA-A*24:01", "HLA-A*24:02", "HLA-A*24:03", "HLA-A*25:01",
+             "HLA-A*26:01", "HLA-A*26:02", "HLA-A*26:03", "HLA-A*29:01", "HLA-A*29:02", "HLA-A*30:01", "HLA-A*30:02",
+             "HLA-A*30:03", "HLA-A*30:04", "HLA-A*31:01", "HLA-A*32:01", "HLA-A*32:07", "HLA-A*32:15", "HLA-A*33:01",
+             "HLA-A*33:03", "HLA-A*66:01", "HLA-A*68:01", "HLA-A*68:02", "HLA-A*68:23", "HLA-A*69:01", "HLA-A*74:01",
+             "HLA-A*80:01", "HLA-B*07:01", "HLA-B*07:02", "HLA-B*08:01", "HLA-B*08:02", "HLA-B*08:03", "HLA-B*12:01",
+             "HLA-B*13:02", "HLA-B*14:01", "HLA-B*14:02", "HLA-B*15:01", "HLA-B*15:02", "HLA-B*15:03", "HLA-B*15:08",
+             "HLA-B*15:09", "HLA-B*15:10", "HLA-B*15:13", "HLA-B*15:16", "HLA-B*15:17", "HLA-B*15:42", "HLA-B*18:01",
+             "HLA-B*27:01", "HLA-B*27:02", "HLA-B*27:03", "HLA-B*27:04", "HLA-B*27:05", "HLA-B*27:06", "HLA-B*27:09",
+             "HLA-B*27:10", "HLA-B*27:20", "HLA-B*35:01", "HLA-B*35:02", "HLA-B*35:03", "HLA-B*35:08", "HLA-B*37:01",
+             "HLA-B*38:01", "HLA-B*39:01", "HLA-B*39:06", "HLA-B*39:09", "HLA-B*39:10", "HLA-B*40:01", "HLA-B*40:02",
+             "HLA-B*40:13", "HLA-B*41:03", "HLA-B*41:04", "HLA-B*42:01", "HLA-B*42:02", "HLA-B*44:01", "HLA-B*44:02",
+             "HLA-B*44:03", "HLA-B*44:05", "HLA-B*45:01", "HLA-B*45:06", "HLA-B*46:01", "HLA-B*48:01", "HLA-B*51:01",
+             "HLA-B*51:02", "HLA-B*52:01", "HLA-B*53:01", "HLA-B*54:01", "HLA-B*55:01", "HLA-B*55:02", "HLA-B*56:01",
+             "HLA-B*57:01", "HLA-B*57:02", "HLA-B*57:03", "HLA-B*58:01", "HLA-B*58:02", "HLA-B*60:01", "HLA-B*61:01",
+             "HLA-B*62:01", "HLA-B*73:01", "HLA-B*81:01", "HLA-B*83:01"])
         __supported_length = frozenset([5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])
         __name = "mhcnuggets-class-1"
         __version = "2.0"
@@ -123,31 +123,29 @@ def version(self):
             # returns the version of the predictor
             return self.__version
 
-        # the interface defines a function converting Fred2's HLA allele presentation
-        # into an internal presentation used by different methods.
-        # for this predictor we won't need it but still have to provide it!
-        # the function consumes a list of alleles and converts them into the internally used presentation
+        # Converts FRED2's internal allele representation into the format required by mhcnuggets
         def convert_alleles(self, alleles):
-            # we just use the identity function
-            return alleles
+            return ['HLA-' + allele.name.replace('*', '') for allele in alleles]
 
-        # additionally the interface defines a function `predict`
-        # that consumes a list of peptides or a single peptide and optionally a list of allele objects
-        # this method implements the complete prediction routine
+        # Converts the internal mhcnuggets-class-1 representation back into a FRED2 representation
+        def revert_allele_repr(self, allele):
+            allele = allele.replace('HLA-', '')
+            allele = allele[:1] + '*' + allele[1:]
+            return allele 
+
+        # predicts the binding affinity for a set of peptides and alleles
         def predict(self, peptides, alleles=None, binary=False, **kwargs):
 
             # test whether one peptide or a list
-            if isinstance(peptides, basestring):
-                peptides = list(peptides)
+            if not isinstance(peptides, list):
+                peptides = [peptides]
 
             # if no alleles are specified do predictions for all supported alleles
             if alleles is None:
                 alleles = self.supportedAlleles
             else:
                 # filter for supported alleles
-                alleles = filter(lambda a: a.name in self.supportedAlleles, alleles)
-
-            result = {}
+                alleles = filter(lambda a: a in self.supportedAlleles, alleles)
 
             # fetch peptides as strings
             peptides = [str(peptide) for peptide in peptides]
@@ -158,9 +156,12 @@ def predict(self, peptides, alleles=None, binary=False, **kwargs):
                 for peptide in peptides:
                     file.write(peptide + "\n")
 
-            # predict bindings
+            alleles = self.convert_alleles(alleles) 
+            result = {}
+            # predict binding affinities
             for a in alleles:
-                result[a] = {}
+                allele_repr = self.revert_allele_repr(a)
+                result[allele_repr] = {}
                 tmp_output_file = tempfile.NamedTemporaryFile().name
                 mhcnuggets_predict(class_='I',
                                    peptides_path=tmp_input_file,
@@ -173,17 +174,18 @@ def predict(self, peptides, alleles=None, binary=False, **kwargs):
                     # skip header
                     reader.next()
 
+                    # assign binding affinities
                     for row in reader:
                         content = row[0].split(',')
                         peptide = content[0]
                         binding_affinity = content[1]
                         if binary:
                             if binding_affinity <= 500:
-                                result[a][peptide] = 1.0
+                                result[allele_repr][peptide] = 1.0
                             else:
-                                result[a][peptide] = 0.0
+                                result[allele_repr][peptide] = 0.0
                         else:
-                            result[a][peptide] = binding_affinity
+                            result[allele_repr][peptide] = binding_affinity
 
             # create EpitopePredictionResult object. This is a multi-indexed DataFrame
             # with Peptide and Method as multi-index and alleles as columns
@@ -264,45 +266,50 @@ def version(self):
             # returns the version of the predictor
             return self.__version
 
-        # the interface defines a function converting Fred2's HLA allele presentation
-        # into an internal presentation used by different methods.
-        # for this predictor we won't need it but still have to provide it!
-        # the function consumes a list of alleles and converts them into the internally used presentation
+        # Converts FRED2's internal allele representation into the format required by mhcnuggets-class-2
         def convert_alleles(self, alleles):
-            return [allele.replace(':', '').replace('*', '') for allele in alleles]
+            return ['HLA-' + allele.name.replace('*', '') for allele in alleles]
+
+        # Converts the internal mhcnuggets-class-2 representation back into a FRED2 representation
+        def revert_allele_repr(self, allele):
+            allele = allele.replace('HLA-', '')
+            # since we need to support single and double mhc2 alleles
+            allele_split = allele.split('-')
+            if len(allele_split) > 1:
+                return allele_split[0][:4] + '*' + allele_split[0][4:] + '-' + allele_split[1][:4] + '*' + allele_split[1][4:]
+            else:
+                return allele_split[0][:4] + '*' + allele_split[0][4:]
 
-        # additionally the interface defines a function `predict`
-        # that consumes a list of peptides or a single peptide and optionally a list
-        # of allele objects
-        #
-        # this method implements the complete prediction routine
+        # predicts the binding affinity for a set of peptides and alleles
         def predict(self, peptides, alleles=None, binary=False, **kwargs):
 
             # test whether one peptide or a list
-            if isinstance(peptides, basestring):
-                peptides = list(peptides)
+            if not isinstance(peptides, list):
+                peptides = [peptides]
 
             # if no alleles are specified do predictions for all supported alleles
             if alleles is None:
                 alleles = self.supportedAlleles
             else:
                 # filter for supported alleles
-                alleles = filter(lambda a: a.name in self.supportedAlleles, alleles)
-
-            result = {}
+                alleles = filter(lambda a: a in self.supportedAlleles, alleles)
 
             # fetch peptides as strings
             peptides = [str(peptide) for peptide in peptides]
 
+            alleles = self.convert_alleles(alleles)
+
             # write peptides temporarily, new line separated
             tmp_input_file = tempfile.NamedTemporaryFile().name
             with open(tmp_input_file, 'wb') as file:
                 for peptide in peptides:
                     file.write(peptide + "\n")
 
+            result = {}
             # predict bindings
             for a in alleles:
-                result[a] = {}
+                allele_repr = self.revert_allele_repr(a)
+                result[allele_repr] = {}
                 tmp_output_file = tempfile.NamedTemporaryFile().name
 
                 mhcnuggets_predict(class_='II',
@@ -322,11 +329,11 @@ def predict(self, peptides, alleles=None, binary=False, **kwargs):
                         binding_affinity = content[1]
                         if binary:
                             if binding_affinity <= 500:
-                                result[a][peptide] = 1.0
+                                result[allele_repr][peptide] = 1.0
                             else:
-                                result[a][peptide] = 0.0
+                                result[allele_repr][peptide] = 0.0
                         else:
-                            result[a][peptide] = binding_affinity
+                            result[allele_repr][peptide] = binding_affinity
 
             # create EpitopePredictionResult object. This is a multi-indexed DataFrame
             # with Peptide and Method as multi-index and alleles as columns
@@ -386,27 +393,31 @@ def version(self):
             # returns the version of the predictor
             return self.__version
 
-        # the interface defines a function converting Fred2's HLA allele presentation
-        # into an internal presentation used by different methods.
-        # for this predictor we won't need it but still have to provide it!
-        # the function consumes a list of alleles and converts them into the internally used presentation
+        # Converts FRED2's internal HLA allele representation into the representation used by MHCFlurry
         def convert_alleles(self, alleles):
-            # we just use the identity function
-            return alleles
-
-        # additionally the interface defines a function `predict`
+            return ['HLA-' + allele.name.replace(':', '').replace('*', '') for allele in alleles]
+
+        # Converts the internal MHCFlurry representation back into a FRED2 representation
+        def revert_allele_repr(self, allele):
+            allele = allele.replace('HLA-', '')
+            allele = allele[:1] + '*' + allele[1:3] + ':' + allele[3:]
+            return allele 
+                    
+        # predicts the binding affinity for a set of peptides and alleles
         def predict(self, peptides, alleles=None, binary=False, **kwargs):
 
             # test whether one peptide or a list
-            if isinstance(peptides, basestring):
-                peptides = list(peptides)
+            if not isinstance(peptides, list):
+                peptides = [peptides]
 
             # if no alleles are specified do predictions for all supported alleles
             if alleles is None:
                 alleles = self.supportedAlleles
             else:
                 # filter for supported alleles
-                alleles = filter(lambda a: a.name in self.supportedAlleles, alleles)
+                alleles = filter(lambda a: a in self.supportedAlleles, alleles)
+
+            alleles = self.convert_alleles(alleles)
 
             # test mhcflurry models are available => download if not
             p = subprocess.Popen(['mhcflurry-downloads', 'path', 'models_class1'],
@@ -417,20 +428,21 @@ def predict(self, peptides, alleles=None, binary=False, **kwargs):
             # load model
             predictor = Class1AffinityPredictor.load()
 
+            # predict and assign binding affinities
             result = {}
-
             for a in alleles:
-                result[a] = {}
+                allele_repr = self.revert_allele_repr(a)
+                result[allele_repr] = {}
                 for p in peptides:
                     seq = p.__str__()
-                    binding_affinity = predictor.predict(allele=a, peptides=[seq])
+                    binding_affinity = predictor.predict(allele=a, peptides=[seq])[0]
                     if binary:
                         if binding_affinity <= 500:
-                            result[a][p] = 1.0
+                            result[allele_repr][p] = 1.0
                         else:
-                            result[a][p] = 0.0
+                            result[allele_repr][p] = 0.0
                     else:
-                        result[a][p] = binding_affinity
+                        result[allele_repr][p] = binding_affinity
 
             # create EpitopePredictionResult object. This is a multi-indexed DataFrame
             # with Peptide and Method as multi-index and alleles as columns