Merge pull request #46 from wilhelm-lab/prosit_2023_intensity_tof

Add Prosit 2023 intensity TOF model.
wilhelm-lab · Aug 3, 2023 · 701e8f0 · 701e8f0
2 parents 8b550bc + 0314389
commit 701e8f0
Show file tree

Hide file tree

Showing 19 changed files with 651 additions and 71 deletions.
diff --git a/models/Prosit/Prosit_2023_intensity_TOF/1/.donotdelete b/models/Prosit/Prosit_2023_intensity_TOF/1/.donotdelete
diff --git a/models/Prosit/Prosit_2023_intensity_TOF/config.pbtxt b/models/Prosit/Prosit_2023_intensity_TOF/config.pbtxt
@@ -0,0 +1,133 @@
+max_batch_size: 1000
+platform: "ensemble"
+input [
+  {
+   name: 'peptide_sequences',
+   data_type: TYPE_STRING,
+   dims: [-1]
+  },
+  {
+    name: 'precursor_charges',
+    data_type: TYPE_INT32,
+    dims: [1],
+  },
+  {
+    name: 'collision_energies',
+    data_type: TYPE_FP32,
+    dims: [1],
+  }
+]
+output [
+  {
+   name: 'intensities',
+   data_type: TYPE_FP32,
+   dims: [174]
+  },
+ {
+   name: 'mz',
+   data_type: TYPE_FP32,
+   dims: [174]
+ },
+ {
+   name: 'annotation',
+   data_type: TYPE_STRING,
+   dims: [174]
+ }
+]
+
+ensemble_scheduling {
+  step [
+     {
+      model_name: "Prosit_Preprocess_charge"
+      model_version: 1
+      input_map {
+        key: "precursor_charges"
+        value: "precursor_charges"
+      },
+      output_map {
+        key: "precursor_charges_in:0"
+        value: "precursor_charges_in_preprocessed:0"
+      }
+    },
+    {
+      model_name: "Prosit_Preprocess_peptide"
+      model_version: 1
+      input_map {
+        key: "peptide_sequences"
+        value: "peptide_sequences"
+      },
+      output_map {
+        key: "peptides_in:0"
+        value: "peptides_in:0"
+      }
+    },
+    {
+      model_name: "Prosit_Preprocess_collision_energy"
+      model_version: 1
+      input_map {
+        key: "raw_collision_energy"
+        value: "collision_energies"
+      },
+      output_map {
+        key: "norm_collision_energy"
+        value: "norm_collision_energy"
+      }
+    },
+    {
+      model_name: "Prosit_2023_intensity_TOF_core"
+      model_version: 1
+      input_map {
+        key: "peptides_in"
+        value: "peptides_in:0"
+      },
+      input_map {
+        key: "collision_energy_in"
+        value: "norm_collision_energy"
+      },
+      input_map {
+        key: "precursor_charge_in"
+        value: "precursor_charges_in_preprocessed:0"
+      }
+      output_map {
+        key: "out"
+        value: "out/Reshape:0"
+      }
+    },
+    {
+      model_name: "Prosit_2019_intensity_postprocess"
+      model_version: 1
+      input_map {
+        key: "peptides_in:0"
+        value: "peptide_sequences"
+      },
+      input_map{
+        key: "precursor_charges_in:0"
+        value: "precursor_charges_in_preprocessed:0"
+      }
+      input_map{
+        key: "peaks_in:0",
+        value: "out/Reshape:0"
+      }
+      output_map {
+        key: "intensities"
+        value: "intensities"
+      }
+      output_map {
+        key: "mz"
+        value: "mz"
+      }
+    },
+    {
+      model_name: "Prosit_Helper_annotation"
+      model_version: 1
+      input_map {
+        key: "precursor_charges"
+        value: "precursor_charges"
+      },
+      output_map {
+        key: "annotation"
+        value: "annotation"
+      }
+    }
+  ]
+}
diff --git a/models/Prosit/Prosit_2023_intensity_TOF/notes.yaml b/models/Prosit/Prosit_2023_intensity_TOF/notes.yaml
@@ -0,0 +1,37 @@
+description: |
+  The HCD Prosit 2020 model was fine-tuned using 277,781 MS/MS spectra of both tryptic and non-tryptic synthesized peptides measured on a timsTOF Pro. The model architecture remained unchanged. The data was split into three distinct sets with each peptide and subsequence of a peptide only included in one of the three: training (80%, 153,809 tryptic PSMs and 77,577 non-tryptic PSMs), validation (10%, 16,483 tryptic PSMs and 7,778 non-tryptic PSMs), and test (10%, 14,262 tryptic PSMs and 7,872 non-tryptic PSMs).
+  
+  For this project, over 300,000 non-tryptic peptides from the ProteomeTools project were measured. Our measurements encompassed a range of collision energies from 20.81 eV to 69.77 eV. The data was analyzed using MaxQuant version 2.1.2.0 with carbamidomethylated cysteine specified as a fixed modification and methionine oxidation as a variable modification.
+  
+  The HCD Prosit 2020 model was originally trained on approximately 30 million MS/MS spectra, consisting of 9 million MS/MS spectra of non-tryptic peptides and 21 million previously published tryptic MS/MS spectra. The comparison between the HCD Prosit 2020 model and the newly developed TOF Prosit 2023 model reveals a substantial improvement in normalized spectral contrast angle (SA) between predicted and experimental timsTOF MS/MS spectra for both non-tryptic and tryptic peptides. The TOF Prosit 2023 model achieved a SA ≥ 0.9 for 26.3% of non-tryptic spectra (compared to 2.4% with HCD Prosit 2020) and 42.1% of tryptic spectra (compared to 0.2% with HCD Prosit 2020).
+  
+  The TOF Prosit 2023 model demonstrates consistent performance across different precursor charges, peptide lengths, and collision energies, with minimal bias towards C- and N-terminal amino acids. Both the tryptic and non-tryptic timsTOF data are available via PRIDE, with the identifiers PXD019086 and PXD043844, respectively.
+
+citation: |
+  Fragment ion intensity prediction improves the identification rate of non-tryptic peptides in TimsTOF
+  Charlotte Adams, Wassim Gabriel, Kris Laukens, Mathias Wilhelm, Wout Bittremieux, Kurt Boonen
+  bioRxiv 2023.07.17.549401; doi: https://doi.org/10.1101/2023.07.17.549401 
+tag: "Intensity"
+tag_url: "https://www.proteomicsdb.org/"
+examples:
+  inputs:
+    [
+      {
+          "name": "peptide_sequences",
+          "httpdtype": "BYTES",
+          "shape": "[2,1]",
+          "data": '["AAAAAKAK", "AAAAAKAK"]'
+      },
+      {
+          "name": "precursor_charges",
+          "httpdtype": "INT32",
+          "shape": "[2,1]",
+          "data": '[1,2]'
+      },
+      {
+          "name": "collision_energies",
+          "httpdtype": "FP32",
+          "shape": "[2,1]",
+          "data": '[25, 25]'
+      }
+    ]
diff --git a/models/Prosit/Prosit_2023_intensity_TOF_core/1/.zenodo b/models/Prosit/Prosit_2023_intensity_TOF_core/1/.zenodo
@@ -0,0 +1 @@
+https://zenodo.org/record/8211811/files/model.savedmodel.zip?download=1
diff --git a/models/Prosit/Prosit_2023_intensity_TOF_core/config.pbtxt b/models/Prosit/Prosit_2023_intensity_TOF_core/config.pbtxt
@@ -0,0 +1 @@
+max_batch_size: 1000
diff --git a/test/Prosit/arr_Prosit_2023_intensity_TOF_ce.npy b/test/Prosit/arr_Prosit_2023_intensity_TOF_ce.npy
diff --git a/test/Prosit/arr_Prosit_2023_intensity_TOF_ce_norm.npy b/test/Prosit/arr_Prosit_2023_intensity_TOF_ce_norm.npy
diff --git a/test/Prosit/arr_Prosit_2023_intensity_TOF_charge.npy b/test/Prosit/arr_Prosit_2023_intensity_TOF_charge.npy
diff --git a/test/Prosit/arr_Prosit_2023_intensity_TOF_charge_onehot.npy b/test/Prosit/arr_Prosit_2023_intensity_TOF_charge_onehot.npy
diff --git a/test/Prosit/arr_Prosit_2023_intensity_TOF_int.npy b/test/Prosit/arr_Prosit_2023_intensity_TOF_int.npy
diff --git a/test/Prosit/arr_Prosit_2023_intensity_TOF_int_raw.npy b/test/Prosit/arr_Prosit_2023_intensity_TOF_int_raw.npy
diff --git a/test/Prosit/arr_Prosit_2023_intensity_TOF_seq.npy b/test/Prosit/arr_Prosit_2023_intensity_TOF_seq.npy
diff --git a/test/Prosit/arr_Prosit_2023_intensity_TOF_seq_encoding.npy b/test/Prosit/arr_Prosit_2023_intensity_TOF_seq_encoding.npy
diff --git a/test/Prosit/test_Prosit_2023_intensity_TOF.py b/test/Prosit/test_Prosit_2023_intensity_TOF.py
@@ -0,0 +1,64 @@
+from test.server_config import SERVER_GRPC, SERVER_HTTP
+import tritonclient.grpc as grpcclient
+import numpy as np
+from pathlib import Path
+import requests
+
+# Derive MODEL_NAME from this file's name, which must be test_<MODEL_NAME>.py
+MODEL_NAME = Path(__file__).stem.replace("test_", "")
+
+
+def test_available_http():
+    req = requests.get(f"{SERVER_HTTP}/v2/models/{MODEL_NAME}", timeout=1)
+    assert req.status_code == 200
+
+
+def test_available_grpc():
+    triton_client = grpcclient.InferenceServerClient(url=SERVER_GRPC)
+    assert triton_client.is_model_ready(MODEL_NAME)
+
+
+def test_inference():
+    SEQUENCES = np.load(
+        "test/Prosit/arr_Prosit_2023_intensity_TOF_seq.npy", allow_pickle=True
+    )
+    charge = np.load("test/Prosit/arr_Prosit_2023_intensity_TOF_charge.npy")
+    ces = np.load("test/Prosit/arr_Prosit_2023_intensity_TOF_ce.npy")
+
+    triton_client = grpcclient.InferenceServerClient(url=SERVER_GRPC)
+
+    in_pep_seq = grpcclient.InferInput("peptide_sequences", SEQUENCES.shape, "BYTES")
+    in_pep_seq.set_data_from_numpy(SEQUENCES)
+
+    in_charge = grpcclient.InferInput("precursor_charges", charge.shape, "INT32")
+    in_charge.set_data_from_numpy(charge)
+
+    in_ces = grpcclient.InferInput("collision_energies", ces.shape, "FP32")
+    in_ces.set_data_from_numpy(ces)
+
+    result = triton_client.infer(
+        MODEL_NAME,
+        inputs=[in_pep_seq, in_charge, in_ces],
+        outputs=[
+            grpcclient.InferRequestedOutput("intensities"),
+            grpcclient.InferRequestedOutput("mz"),
+            grpcclient.InferRequestedOutput("annotation"),
+        ],
+    )
+
+    intensities = result.as_numpy("intensities")
+    fragmentmz = result.as_numpy("mz")
+    annotation = result.as_numpy("annotation")
+
+    assert intensities.shape == (SEQUENCES.shape[0], 174)
+    assert fragmentmz.shape == (SEQUENCES.shape[0], 174)
+    assert annotation.shape == (SEQUENCES.shape[0], 174)
+
+    # Assert intensities consistent
+    assert np.allclose(
+        intensities,
+        np.load("test/Prosit/arr_Prosit_2023_intensity_TOF_int.npy"),
+        rtol=0,
+        atol=1e-5,
+        equal_nan=True,
+    )
diff --git a/test/Prosit/test_Prosit_2023_intensity_TOF_core.py b/test/Prosit/test_Prosit_2023_intensity_TOF_core.py
@@ -0,0 +1,52 @@
+from test.server_config import SERVER_GRPC, SERVER_HTTP
+import tritonclient.grpc as grpcclient
+import numpy as np
+from pathlib import Path
+import requests
+
+# Derive MODEL_NAME from this file's name, which must be test_<MODEL_NAME>.py
+MODEL_NAME = Path(__file__).stem.replace("test_", "")
+
+
+def test_available_http():
+    req = requests.get(f"{SERVER_HTTP}/v2/models/{MODEL_NAME}", timeout=1)
+    assert req.status_code == 200
+
+
+def test_available_grpc():
+    triton_client = grpcclient.InferenceServerClient(url=SERVER_GRPC)
+    assert triton_client.is_model_ready(MODEL_NAME)
+
+
+def test_inference():
+    seq = np.load("test/Prosit/arr_Prosit_2023_intensity_TOF_seq_encoding.npy")
+    charge = np.load("test/Prosit/arr_Prosit_2023_intensity_TOF_charge_onehot.npy")
+    ces = np.load("test/Prosit/arr_Prosit_2023_intensity_TOF_ce_norm.npy")
+
+    triton_client = grpcclient.InferenceServerClient(url=SERVER_GRPC)
+
+    in_pep_seq = grpcclient.InferInput("peptides_in", seq.shape, "INT32")
+    in_pep_seq.set_data_from_numpy(seq)
+
+    in_charge = grpcclient.InferInput("precursor_charge_in", charge.shape, "FP32")
+    in_charge.set_data_from_numpy(charge)
+
+    in_ces = grpcclient.InferInput("collision_energy_in", ces.shape, "FP32")
+    in_ces.set_data_from_numpy(ces)
+
+    result = triton_client.infer(
+        MODEL_NAME,
+        inputs=[in_pep_seq, in_charge, in_ces],
+        outputs=[
+            grpcclient.InferRequestedOutput("out"),
+        ],
+    )
+
+    intensities = result.as_numpy("out")
+
+    assert np.allclose(
+        intensities,
+        np.load("test/Prosit/arr_Prosit_2023_intensity_TOF_int_raw.npy"),
+        rtol=0,
+        atol=1e-4,
+    )
diff --git a/web/README.md b/web/README.md
@@ -3,6 +3,10 @@
 Koina landing page and documentation built with [Nuxt](https://nuxt.com)
 and [RapiDoc](https://rapidocweb.com/index.html)
 
+## OpenAPI
+
+Regenerate the `public/openapi.yaml` by running `./web/openapi/openapi_gen.py`
+
 ## Development Server
 
 Make sure to install the dependencies first

diff --git a/web/openapi/openapi_gen.py b/web/openapi/openapi_gen.py
@@ -57,6 +57,8 @@ def sleep_until_service_starts(http_server):
 
 
 def get_config(http_url, name):
+    # TODO throw an error when an unknown model is requested
+    # {'error': "Request for unknown model: 'Deeplc_hela_hf' is not found"}
     url = http_url + f"/v2/models/{name}/config"
     logging.info(f"Getting config from:\t\t{url}")
     r = requests.get(url, timeout=1)
@@ -105,6 +107,12 @@ def main(http_url, grpc_url, tmpl_url):
         models[-1]["note"]["description"] = models[-1]["note"]["description"].replace(
             "\n", "<br>"
         )
+        try:
+            models[-1]["note"]["citation"] = models[-1]["note"]["citation"].replace(
+                "\n", "<br>"
+            )
+        except KeyError:
+            logging.warning(f"Model {name} does not contain a citation")
         add_np_and_openapi_dtype(models[-1]["note"])
         copy_outputs_to_note(models[-1])
         verify_inputs(models[-1])

diff --git a/web/openapi/templates/openapi.yml b/web/openapi/templates/openapi.yml
@@ -38,12 +38,9 @@ externalDocs:
 servers:
   - url: {{tmpl_url}}/v2/models
 tags:
-{% for model in models %}
-  - name: {{model.note.tag}}
-    externalDocs:
-      description: Find out more
-      url: {{model.note.tag_url}}
-{% endfor %}
+  - name: Retention Time
+  - name: Intensity
+  - name: Collisional cross section
 paths:
 {% for model in models %}
   /{{model.name}}/infer:
@@ -52,11 +49,24 @@ paths:
         - {{model.note.tag}}
       summary: {{model.name}}
       description: |
+        **Summary**
+
         {{model.note.description }}
+        
+        **Citation**
+        
+        {{model.note.citation }}
+
+        **Examples**
+
+        <details>
+        <summary>Python using GRPC</summary>
+        
         ```python
         {{model.code}}
         ```
-        "
+
+        </details>
       operationId: {{model.name}}
       requestBody:
         description: Body