Skip to content

Commit

Permalink
Merge branch 'development' into chore/document_functions
Browse files Browse the repository at this point in the history
  • Loading branch information
picciama committed Sep 10, 2024
2 parents 23ea74a + 071bf39 commit 964b38c
Show file tree
Hide file tree
Showing 19 changed files with 117,353 additions and 214 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
unit_tests/data/quantification

# Translations
*.mo
Expand Down
4 changes: 4 additions & 0 deletions docs/_static/custom_cookietemple.css
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ table.align-default {
padding-left: 50px;
}

.rescore-config-table tbody tr:last-child td:first-child {
padding-left: 50px;
}

.date {
font-size: 50%;
}
10 changes: 8 additions & 2 deletions docs/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ Applicable to rescoring
-----------------------

.. table::
:class: fixed-table
:class: fixed-table rescore-config-table

+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Parameter | Description |
Expand All @@ -82,6 +82,12 @@ Applicable to rescoring
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| add_feature_cols | Additional columns to be used as percolator/mokapot input features; Can be "all" for all additional columns in provided internal search results or a list of column names; default = "none" |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| quantification | (Optional) If True, run picked-group-FDR for quantification. This also requires in-silico digestion options (see "Applicable to in-silico digestion") and a fasta input. |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| inputs | Contains information about the fasta file (only needed if quantification is True). |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| library_input | Path to fasta file for in-silico digestion (also see the required parameters for in-silico digestion above) |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Applicable to spectral library generation
-----------------------------------------
Expand Down Expand Up @@ -139,4 +145,4 @@ Applicable to in-silico digestion
| specialAas | Special amino acids for decoy generation; default = "KR" |
+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| db | Defines whether the digestion should contain only targets, only decoys or both (concatenated); can be "target", "decoy" or "concat"; default = "concat" |
+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
50 changes: 50 additions & 0 deletions docs/jobs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,54 @@ The example config can be loaded and viewed using
import oktoberfest as ok
import json
config = ok.utils.example_configs.RESCORING
json.dumps(config, indent=4)

For rescoring tasks including quantification via picked-group-FDR, create a config file like this (so far only MaxQuant is supported):

.. code-block:: json

{
"type": "Rescoring",
"quantification": true,
"tag": "",
"inputs": {
"search_results": "mq_results/txt",
"search_results_type": "Maxquant",
"spectra": "./",
"spectra_type": "raw",
"library_input": "uniprot.fasta"
},
"output": "./out",
"models": {
"intensity": "Prosit_2020_intensity_HCD",
"irt": "Prosit_2019_irt"
},
"prediction_server": "koina.proteomicsdb.org:443",
"ssl": true,
"thermoExe": "/opt/compomics/ThermoRawFileParser1.4.3/ThermoRawFileParser.exe",
"numThreads": 1,
"fdr_estimation_method": "percolator",
"regressionMethod": "spline",
"massTolerance": 20,
"unitMassTolerance": "ppm",
"fastaDigestOptions": {
"digestion": "full",
"missedCleavages": 2,
"minLength": 7,
"maxLength": 60,
"enzyme": "trypsin",
"specialAas": "KR",
"db": "concat"
}
}

The example config can be loaded and viewed using

.. code-block:: python

import oktoberfest as ok
import json
config = ok.utils.example_configs.RESCORING_WITH_QUANT
json.dumps(config, indent=4)
27 changes: 16 additions & 11 deletions oktoberfest/data/spectra.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations

import logging
from enum import Enum
from pathlib import Path
from typing import List, Optional, Tuple, Type, TypeVar, Union
from typing import TYPE_CHECKING, TypeVar

import anndata
import numpy as np
Expand All @@ -10,6 +12,9 @@
import spectrum_fundamentals.constants as c
from scipy.sparse import csr_matrix, dok_matrix

if TYPE_CHECKING:
from anndata.compat import Index

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -37,7 +42,7 @@ class Spectra(anndata.AnnData):
MAX_CHARGE = 3

@staticmethod
def _gen_vars_df(specified_ion_types: Optional[List[str]] = None) -> pd.DataFrame:
def _gen_vars_df(specified_ion_types: list[str] | None = None) -> pd.DataFrame:
"""
Creates Annotation dataframe for vars in AnnData object.
Expand All @@ -61,7 +66,7 @@ def _gen_vars_df(specified_ion_types: Optional[List[str]] = None) -> pd.DataFram
return var_df

@staticmethod
def _gen_column_names(fragment_type: FragmentType) -> List[str]:
def _gen_column_names(fragment_type: FragmentType) -> list[str]:
"""
Get column names of the spectra data.
Expand Down Expand Up @@ -108,12 +113,12 @@ def _resolve_layer_name(fragment_type: FragmentType) -> str:
layer = Spectra.MZ_LAYER_NAME
return layer

def __getitem__(self, index: anndata._core.index.Index):
def __getitem__(self, index: Index):
    """Return a sliced view of this object, keeping the Spectra type instead of plain AnnData."""
    row_idx, col_idx = self._normalize_indices(index)
    return Spectra(self, oidx=row_idx, vidx=col_idx, asview=True)

def add_column(self, data: Union[np.ndarray, pd.Series], name: Optional[str] = None) -> None:
def add_column(self, data: np.ndarray | pd.Series, name: str | None = None) -> None:
"""
Add column to spectra data.
Expand Down Expand Up @@ -192,9 +197,9 @@ def add_intensities(self, intensities: np.ndarray, annotation: np.ndarray, fragm

def add_list_of_predicted_intensities(
self,
intensities: List[np.ndarray],
annotations: List[np.ndarray],
chunk_indices: List[np.ndarray],
intensities: list[np.ndarray],
annotations: list[np.ndarray],
chunk_indices: list[np.ndarray],
):
"""
Add chunks of predicted intensities and convert to sparse matrix.
Expand Down Expand Up @@ -253,7 +258,7 @@ def _add_predicted_intensites(

# self.obs.iloc[index]["done"] = True

def get_matrix(self, fragment_type: FragmentType) -> Tuple[csr_matrix, List[str]]:
def get_matrix(self, fragment_type: FragmentType) -> tuple[csr_matrix, list[str]]:
"""
Get intensities sparse matrix from AnnData object.
Expand All @@ -268,7 +273,7 @@ def get_matrix(self, fragment_type: FragmentType) -> Tuple[csr_matrix, List[str]

return matrix, self._gen_column_names(fragment_type)

def write_as_hdf5(self, output_file: Union[str, Path]):
def write_as_hdf5(self, output_file: str | Path):
"""
Write spectra_data to hdf5 file.
Expand All @@ -277,7 +282,7 @@ def write_as_hdf5(self, output_file: Union[str, Path]):
self.write(output_file, compression="gzip")

@classmethod
def from_hdf5(cls: Type[SpectraT], input_file: Union[str, Path]) -> SpectraT:
def from_hdf5(cls: type[SpectraT], input_file: str | Path) -> SpectraT:
"""
Read from hdf5 file.
Expand Down
9 changes: 7 additions & 2 deletions oktoberfest/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from oktoberfest import rescore as re

from .data.spectra import Spectra
from .utils import Config, JobPool, ProcessStep, group_iterator
from .utils import Config, JobPool, ProcessStep, apply_quant, group_iterator

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -655,9 +655,14 @@ def run_rescoring(config_path: Union[str, Path]):
# plotting
logger.info("Generating summary plots...")
pl.plot_all(fdr_dir)

logger.info("Finished rescoring.")

if config.quantification:
logger.info("Starting quantification")
# method contains picked-group-FDR call
apply_quant(config)
logger.info("Finished quantification")


def run_job(config_path: Union[str, Path]):
"""
Expand Down
1 change: 1 addition & 0 deletions oktoberfest/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .groupiterator import group_iterator
from .multiprocessing_pool import JobPool
from .process_step import ProcessStep
from .quantification import apply_quant
57 changes: 57 additions & 0 deletions oktoberfest/utils/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import os
from pathlib import Path
from sys import platform
from typing import Dict, List, Optional, Tuple, Union
Expand Down Expand Up @@ -58,6 +59,11 @@ def job_type(self) -> str:
else:
raise ValueError("No job type specified in config file.")

@property
def quantification(self) -> bool:
    """Flag controlling whether picked-group-FDR quantification is performed after rescoring."""
    try:
        return self.data["quantification"]
    except KeyError:
        return False

@property
def mass_tolerance(self) -> Optional[float]:
"""Get mass tolerance value from the config file with which to caluculate the min and max mass values."""
Expand Down Expand Up @@ -377,6 +383,10 @@ def check(self):
if self.job_type == "SpectralLibraryGeneration":
self._check_for_speclib()

if self.quantification:
self._check_quantification()
self._check_fasta()

if "alphapept" in int_model:
instrument_type = self.instrument_type
valid_alphapept_instrument_types = ["QE", "LUMOS", "TIMSTOF", "SCIEXTOF"]
Expand Down Expand Up @@ -424,6 +434,53 @@ def _check_for_speclib(self):
f"{instrument_type}. Provide one of {valid_alphapept_instrument_types}."
)

def _find_file_in_subd(self, directory: Path, filename: str):
for _, _, files in os.walk(directory):
if filename in files:
return True
return False

def _check_quantification(self):
    """
    Validate that the quantification input files for the configured search engine exist.

    Depending on ``search_results_type``, picked-group-FDR needs specific files next to
    (or, for MSFragger, below) the search results location:
    MaxQuant requires ``evidence.txt``; Sage requires ``results.sage.tsv`` and ``lfq.tsv``;
    MSFragger requires ``psm.tsv`` (in any subdirectory) and ``combined_ion.tsv``.

    :raises AssertionError: if a required file for the configured search engine is missing
    """
    # search_results may point at a single file or at the results directory;
    # normalize to the directory expected to contain the quantification files.
    if Path(self.search_results).is_file():
        path_stem = Path(self.search_results).parent
    else:
        path_stem = Path(self.search_results)

    # NOTE(review): comparisons assume search_results_type is normalized to lowercase
    # upstream (configs use e.g. "Maxquant") — verify against the property's getter.
    if self.search_results_type == "maxquant" and not Path(path_stem / "evidence.txt").is_file():
        raise AssertionError(
            f"You specified the search results as {self.search_results_type} but evidence.txt is not available "
            f"at {path_stem / 'evidence.txt'}."
        )
    elif self.search_results_type == "sage":
        if not Path(path_stem / "results.sage.tsv").is_file():
            raise AssertionError(
                f"You specified the search results as {self.search_results_type} for quantification, but "
                f"results.sage.tsv is not available at {path_stem / 'results.sage.tsv'}."
            )
        # only reached when results.sage.tsv exists; the first missing file wins
        elif not Path(path_stem / "lfq.tsv").is_file():
            raise AssertionError(
                f"You specified the search results as {self.search_results_type} for quantification, but "
                f"lfq.tsv is not available at {path_stem / 'lfq.tsv'}."
            )
    elif self.search_results_type == "msfragger":
        # MSFragger writes one psm.tsv per experiment subdirectory, so search recursively.
        if not self._find_file_in_subd(path_stem, "psm.tsv"):
            raise AssertionError(
                f"You specified the search results as {self.search_results_type} for quantification, but "
                "no psm.tsv files could be found in subdirectories."
            )
        elif not Path(path_stem / "combined_ion.tsv").is_file():
            raise AssertionError(
                f"You specified the search results as {self.search_results_type} for quantification, but "
                f"combined_ion.tsv is not available at {path_stem / 'combined_ion.tsv'}."
            )

def _check_fasta(self):
if not self.library_input_type.lower() == "fasta":
raise AssertionError(
f"The specified library input type is set to {self.library_input_type}. "
"For quantification a fasta file is needed."
)

def __init__(self):
"""Initialize config file data."""
self.data = {}
Expand Down
36 changes: 36 additions & 0 deletions oktoberfest/utils/example_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,42 @@
},
}

# Example configuration for a rescoring run with picked-group-FDR quantification
# enabled ("quantification": True). In addition to the plain rescoring options it
# supplies a fasta "library_input" and the in-silico digestion settings
# ("fastaDigestOptions") that quantification requires.
RESCORING_WITH_QUANT = {
    "type": "Rescoring",
    "quantification": True,
    "tag": "",
    "inputs": {
        "search_results": "mq_results/txt",
        "search_results_type": "Maxquant",
        "spectra": "./",
        "spectra_type": "raw",
        "library_input": "uniprot.fasta",
    },
    "output": "./out",
    "models": {
        "intensity": "Prosit_2020_intensity_HCD",
        "irt": "Prosit_2019_irt",
    },
    "prediction_server": "koina.proteomicsdb.org:443",
    "ssl": True,
    "thermoExe": "/opt/compomics/ThermoRawFileParser1.4.3/ThermoRawFileParser.exe",
    "numThreads": 1,
    "fdr_estimation_method": "percolator",
    "regressionMethod": "spline",
    "allFeatures": False,
    "massTolerance": 20,
    "unitMassTolerance": "ppm",
    "fastaDigestOptions": {
        "digestion": "full",
        "missedCleavages": 2,
        "minLength": 7,
        "maxLength": 60,
        "enzyme": "trypsin",
        "specialAas": "KR",
        "db": "concat",
    },
}

CECALIB = {
"type": "CollisionEnergyCalibration",
"tag": "",
Expand Down
Loading

0 comments on commit 964b38c

Please sign in to comment.