Skip to content

Commit

Permalink
Merge branch 'development' into chore/document_functions
Browse files Browse the repository at this point in the history
  • Loading branch information
picciama committed Sep 10, 2024
2 parents 23ea74a + 071bf39 commit 964b38c
Show file tree
Hide file tree
Showing 19 changed files with 117,353 additions and 214 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
unit_tests/data/quantification

# Translations
*.mo
Expand Down
4 changes: 4 additions & 0 deletions docs/_static/custom_cookietemple.css
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ table.align-default {
padding-left: 50px;
}

.rescore-config-table tbody tr:last-child td:first-child {
padding-left: 50px;
}

.date {
font-size: 50%;
}
10 changes: 8 additions & 2 deletions docs/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ Applicable to rescoring
-----------------------

.. table::
:class: fixed-table
:class: fixed-table rescore-config-table

+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Parameter | Description |
Expand All @@ -82,6 +82,12 @@ Applicable to rescoring
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| add_feature_cols | Additional columns to be used as percolator/mokapot input features; Can be "all" for all additional columns in provided internal search results or a list of column names; default = "none" |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| quantification | (Optional) If True, run picked-group-FDR for quantification. This also requires in-silico digestion options (see "Applicable to in-silico digestion") and a fasta input. |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| inputs | Contains information about the fasta file (only needed if quantification is True). |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| library_input | Path to fasta file for in-silico digestion (also see the required parameters for in-silico digestion above) |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Applicable to spectral library generation
-----------------------------------------
Expand Down Expand Up @@ -139,4 +145,4 @@ Applicable to in-silico digestion
| specialAas | Special amino acids for decoy generation; default = "KR" |
+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| db | Defines whether the digestion should contain only targets, only decoys or both (concatenated); can be "target", "decoy" or "concat"; default = "concat" |
+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
50 changes: 50 additions & 0 deletions docs/jobs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,54 @@ The example config can be loaded and viewed using
import oktoberfest as ok
import json
config = ok.utils.example_configs.RESCORING
json.dumps(config, indent=4)

For rescoring tasks including quantification via picked-group-FDR, create a config file like this (so far only MaxQuant is supported):

.. code-block:: json

{
"type": "Rescoring",
"quantification": true,
"tag": "",
"inputs": {
"search_results": "mq_results/txt",
"search_results_type": "Maxquant",
"spectra": "./",
"spectra_type": "raw",
"library_input": "uniprot.fasta"
},
"output": "./out",
"models": {
"intensity": "Prosit_2020_intensity_HCD",
"irt": "Prosit_2019_irt"
},
"prediction_server": "koina.proteomicsdb.org:443",
"ssl": true,
"thermoExe": "/opt/compomics/ThermoRawFileParser1.4.3/ThermoRawFileParser.exe",
"numThreads": 1,
"fdr_estimation_method": "percolator",
"regressionMethod": "spline",
"massTolerance": 20,
"unitMassTolerance": "ppm",
"fastaDigestOptions": {
"digestion": "full",
"missedCleavages": 2,
"minLength": 7,
"maxLength": 60,
"enzyme": "trypsin",
"specialAas": "KR",
"db": "concat"
}
}

The example config can be loaded and viewed using

.. code-block:: python

import oktoberfest as ok
import json
config = ok.utils.example_configs.RESCORING_WITH_QUANT
json.dumps(config, indent=4)
27 changes: 16 additions & 11 deletions oktoberfest/data/spectra.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations

import logging
from enum import Enum
from pathlib import Path
from typing import List, Optional, Tuple, Type, TypeVar, Union
from typing import TYPE_CHECKING, TypeVar

import anndata
import numpy as np
Expand All @@ -10,6 +12,9 @@
import spectrum_fundamentals.constants as c
from scipy.sparse import csr_matrix, dok_matrix

if TYPE_CHECKING:
from anndata.compat import Index

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -37,7 +42,7 @@ class Spectra(anndata.AnnData):
MAX_CHARGE = 3

@staticmethod
def _gen_vars_df(specified_ion_types: Optional[List[str]] = None) -> pd.DataFrame:
def _gen_vars_df(specified_ion_types: list[str] | None = None) -> pd.DataFrame:
"""
Creates Annotation dataframe for vars in AnnData object.
Expand All @@ -61,7 +66,7 @@ def _gen_vars_df(specified_ion_types: Optional[List[str]] = None) -> pd.DataFram
return var_df

@staticmethod
def _gen_column_names(fragment_type: FragmentType) -> List[str]:
def _gen_column_names(fragment_type: FragmentType) -> list[str]:
"""
Get column names of the spectra data.
Expand Down Expand Up @@ -108,12 +113,12 @@ def _resolve_layer_name(fragment_type: FragmentType) -> str:
layer = Spectra.MZ_LAYER_NAME
return layer

def __getitem__(self, index: anndata._core.index.Index):
def __getitem__(self, index: Index):
    """Return a sliced view of this object, keeping the Spectra type instead of plain AnnData."""
    row_idx, col_idx = self._normalize_indices(index)
    return Spectra(self, oidx=row_idx, vidx=col_idx, asview=True)

def add_column(self, data: Union[np.ndarray, pd.Series], name: Optional[str] = None) -> None:
def add_column(self, data: np.ndarray | pd.Series, name: str | None = None) -> None:
"""
Add column to spectra data.
Expand Down Expand Up @@ -192,9 +197,9 @@ def add_intensities(self, intensities: np.ndarray, annotation: np.ndarray, fragm

def add_list_of_predicted_intensities(
self,
intensities: List[np.ndarray],
annotations: List[np.ndarray],
chunk_indices: List[np.ndarray],
intensities: list[np.ndarray],
annotations: list[np.ndarray],
chunk_indices: list[np.ndarray],
):
"""
Add chunks of predicted intensities and convert to sparse matrix.
Expand Down Expand Up @@ -253,7 +258,7 @@ def _add_predicted_intensites(

# self.obs.iloc[index]["done"] = True

def get_matrix(self, fragment_type: FragmentType) -> Tuple[csr_matrix, List[str]]:
def get_matrix(self, fragment_type: FragmentType) -> tuple[csr_matrix, list[str]]:
"""
Get intensities sparse matrix from AnnData object.
Expand All @@ -268,7 +273,7 @@ def get_matrix(self, fragment_type: FragmentType) -> Tuple[csr_matrix, List[str]

return matrix, self._gen_column_names(fragment_type)

def write_as_hdf5(self, output_file: Union[str, Path]):
def write_as_hdf5(self, output_file: str | Path):
"""
Write spectra_data to hdf5 file.
Expand All @@ -277,7 +282,7 @@ def write_as_hdf5(self, output_file: Union[str, Path]):
self.write(output_file, compression="gzip")

@classmethod
def from_hdf5(cls: Type[SpectraT], input_file: Union[str, Path]) -> SpectraT:
def from_hdf5(cls: type[SpectraT], input_file: str | Path) -> SpectraT:
"""
Read from hdf5 file.
Expand Down
9 changes: 7 additions & 2 deletions oktoberfest/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from oktoberfest import rescore as re

from .data.spectra import Spectra
from .utils import Config, JobPool, ProcessStep, group_iterator
from .utils import Config, JobPool, ProcessStep, apply_quant, group_iterator

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -655,9 +655,14 @@ def run_rescoring(config_path: Union[str, Path]):
# plotting
logger.info("Generating summary plots...")
pl.plot_all(fdr_dir)

logger.info("Finished rescoring.")

if config.quantification:
logger.info("Starting quantification")
# method contains picked-group-FDR call
apply_quant(config)
logger.info("Finished quantification")


def run_job(config_path: Union[str, Path]):
"""
Expand Down
1 change: 1 addition & 0 deletions oktoberfest/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .groupiterator import group_iterator
from .multiprocessing_pool import JobPool
from .process_step import ProcessStep
from .quantification import apply_quant
57 changes: 57 additions & 0 deletions oktoberfest/utils/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import os
from pathlib import Path
from sys import platform
from typing import Dict, List, Optional, Tuple, Union
Expand Down Expand Up @@ -58,6 +59,11 @@ def job_type(self) -> str:
else:
raise ValueError("No job type specified in config file.")

@property
def quantification(self) -> bool:
    """Flag controlling whether picked-group-FDR quantification is performed after rescoring."""
    try:
        return self.data["quantification"]
    except KeyError:
        return False

@property
def mass_tolerance(self) -> Optional[float]:
"""Get mass tolerance value from the config file with which to caluculate the min and max mass values."""
Expand Down Expand Up @@ -377,6 +383,10 @@ def check(self):
if self.job_type == "SpectralLibraryGeneration":
self._check_for_speclib()

if self.quantification:
self._check_quantification()
self._check_fasta()

if "alphapept" in int_model:
instrument_type = self.instrument_type
valid_alphapept_instrument_types = ["QE", "LUMOS", "TIMSTOF", "SCIEXTOF"]
Expand Down Expand Up @@ -424,6 +434,53 @@ def _check_for_speclib(self):
f"{instrument_type}. Provide one of {valid_alphapept_instrument_types}."
)

def _find_file_in_subd(self, directory: Path, filename: str):
for _, _, files in os.walk(directory):
if filename in files:
return True
return False

def _check_quantification(self):
    """
    Validate that the quantification input files for the configured search engine exist.

    Depending on ``search_results_type``, picked-group-FDR needs specific files next to
    (or, for MSFragger, below) the search results location:
    MaxQuant requires ``evidence.txt``; Sage requires ``results.sage.tsv`` and ``lfq.tsv``;
    MSFragger requires ``psm.tsv`` (in any subdirectory) and ``combined_ion.tsv``.

    :raises AssertionError: if a required file for the configured search engine is missing
    """
    # search_results may point at a single file or at the results directory;
    # normalize to the directory expected to contain the quantification files.
    if Path(self.search_results).is_file():
        path_stem = Path(self.search_results).parent
    else:
        path_stem = Path(self.search_results)

    # NOTE(review): comparisons assume search_results_type is normalized to lowercase
    # upstream (configs use e.g. "Maxquant") — verify against the property's getter.
    if self.search_results_type == "maxquant" and not Path(path_stem / "evidence.txt").is_file():
        raise AssertionError(
            f"You specified the search results as {self.search_results_type} but evidence.txt is not available "
            f"at {path_stem / 'evidence.txt'}."
        )
    elif self.search_results_type == "sage":
        if not Path(path_stem / "results.sage.tsv").is_file():
            raise AssertionError(
                f"You specified the search results as {self.search_results_type} for quantification, but "
                f"results.sage.tsv is not available at {path_stem / 'results.sage.tsv'}."
            )
        # only reached when results.sage.tsv exists; the first missing file wins
        elif not Path(path_stem / "lfq.tsv").is_file():
            raise AssertionError(
                f"You specified the search results as {self.search_results_type} for quantification, but "
                f"lfq.tsv is not available at {path_stem / 'lfq.tsv'}."
            )
    elif self.search_results_type == "msfragger":
        # MSFragger writes one psm.tsv per experiment subdirectory, so search recursively.
        if not self._find_file_in_subd(path_stem, "psm.tsv"):
            raise AssertionError(
                f"You specified the search results as {self.search_results_type} for quantification, but "
                "no psm.tsv files could be found in subdirectories."
            )
        elif not Path(path_stem / "combined_ion.tsv").is_file():
            raise AssertionError(
                f"You specified the search results as {self.search_results_type} for quantification, but "
                f"combined_ion.tsv is not available at {path_stem / 'combined_ion.tsv'}."
            )

def _check_fasta(self):
if not self.library_input_type.lower() == "fasta":
raise AssertionError(
f"The specified library input type is set to {self.library_input_type}. "
"For quantification a fasta file is needed."
)

def __init__(self):
"""Initialize config file data."""
self.data = {}
Expand Down
36 changes: 36 additions & 0 deletions oktoberfest/utils/example_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,42 @@
},
}

# Example configuration for a rescoring run with picked-group-FDR quantification
# enabled ("quantification": True). In addition to the plain rescoring options it
# supplies a fasta "library_input" and the in-silico digestion settings
# ("fastaDigestOptions") that quantification requires.
RESCORING_WITH_QUANT = {
    "type": "Rescoring",
    "quantification": True,
    "tag": "",
    "inputs": {
        "search_results": "mq_results/txt",
        "search_results_type": "Maxquant",
        "spectra": "./",
        "spectra_type": "raw",
        "library_input": "uniprot.fasta",
    },
    "output": "./out",
    "models": {
        "intensity": "Prosit_2020_intensity_HCD",
        "irt": "Prosit_2019_irt",
    },
    "prediction_server": "koina.proteomicsdb.org:443",
    "ssl": True,
    "thermoExe": "/opt/compomics/ThermoRawFileParser1.4.3/ThermoRawFileParser.exe",
    "numThreads": 1,
    "fdr_estimation_method": "percolator",
    "regressionMethod": "spline",
    "allFeatures": False,
    "massTolerance": 20,
    "unitMassTolerance": "ppm",
    "fastaDigestOptions": {
        "digestion": "full",
        "missedCleavages": 2,
        "minLength": 7,
        "maxLength": 60,
        "enzyme": "trypsin",
        "specialAas": "KR",
        "db": "concat",
    },
}

CECALIB = {
"type": "CollisionEnergyCalibration",
"tag": "",
Expand Down
Loading

0 comments on commit 964b38c

Please sign in to comment.