From 1e1b10b855a62c106d192f8e726211206b8b372f Mon Sep 17 00:00:00 2001 From: jalew188 Date: Thu, 13 Jun 2024 21:41:31 +0200 Subject: [PATCH 1/4] #53 ADD docs for psm_match.py --- alpharaw/match/psm_match.py | 197 +++++++++++++++++--------- alpharaw/match/psm_match_alphatims.py | 5 +- 2 files changed, 134 insertions(+), 68 deletions(-) diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py index a13de5c..d544389 100644 --- a/alpharaw/match/psm_match.py +++ b/alpharaw/match/psm_match.py @@ -30,6 +30,25 @@ class PepSpecMatch: """ Extract fragment ions from MS2 data. + + Parameters + ---------- + charged_frag_types : list, optional + fragment types with charge states, + e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2']. + If None, it is `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)`. + By default None. + + match_closest : bool, optional + if True, match the closest peak for a m/z; + if False, matched the higest peak for a m/z in the tolerance range. + By default True. + + use_ppm : bool, optional + If use ppm, by default True. + + tol_value : float, optional + tolerance value, by default 20.0 """ match_closest: bool = True @@ -45,25 +64,6 @@ def __init__( use_ppm: bool = True, tol_value: float = 20.0, ): - """ - Parameters - ---------- - charged_frag_types : list, optional - fragment types with charge states, - e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2']. - By default `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)` - - match_closest : bool, optional - if True, match the closest peak for a m/z; - if False, matched the higest peak for a m/z in the tolerance range. 
- By default True - - use_ppm : bool, optional - If use ppm, by default True - - tol_value : float, optional - tolerance value, by default 20.0 - """ self.charged_frag_types = ( get_charged_frag_types(["b", "y", "b_modloss", "y_modloss"], 2) if charged_frag_types is None @@ -73,17 +73,41 @@ def __init__( self.use_ppm = use_ppm self.tolerance = tol_value - def _preprocess_psms(self, psm_df): - pass + def get_fragment_mz_df(self) -> pd.DataFrame: + """ + Call :func:`alphabase.peptide.fragment.create_fragment_mz_dataframe` + for :attr:`PepSpecMatch.psm_df` and :attr:`PepSpecMatch.charged_frag_types`. - def get_fragment_mz_df(self): + + Returns + ------- + DataFrame + _description_ + """ return create_fragment_mz_dataframe( self.psm_df, self.charged_frag_types, dtype=PEAK_MZ_DTYPE, ) - def _add_missing_columns_to_psm_df(self, psm_df: pd.DataFrame, raw_data=None): + def _add_missing_columns_to_psm_df( + self, psm_df: pd.DataFrame, raw_data: MSData_Base = None + ): + """ + Add missing "rt", "nce", "rt_norm", ("mobility") columns to `psm_df` if missing. + + Parameters + ---------- + psm_df : pd.DataFrame + psm dataframe to be processed. + raw_data : MSData_Base, optional + The `MSData_Base`. If None, `self.raw_data`. by default None. + + Returns + ------- + DataFrame + psm_df inplace. + """ if raw_data is None: raw_data = self.raw_data add_spec_info_list = [] @@ -117,7 +141,19 @@ def _add_missing_columns_to_psm_df(self, psm_df: pd.DataFrame, raw_data=None): # psm_df['rt_sec'] = psm_df.rt*60 return psm_df - def _prepare_matching_dfs(self): + def _prepare_matching_dfs(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Prepare dataframes to be matched. + + Returns + ------- + Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] + pd.DataFrame: fragment mz dataframe. + + pd.DataFrame: intensity dataframe to match. + + pd.DataFrame: mz error dataframe to match. 
+ """ fragment_mz_df = self.get_fragment_mz_df() matched_intensity_df = pd.DataFrame( @@ -138,17 +174,17 @@ def load_ms_data( process_count: int = 8, **kwargs, ): - """Load MS files + """Load MS file to set `self.raw_data`. Parameters ---------- ms_file : str | MSData_Base - ms2 file path + ms2 file path. ms_file_type : str, optional ms2 file type, could be ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"]. - Default to 'alpharaw_hdf' + Default to 'alpharaw_hdf'. """ self.raw_data = load_ms_data(ms_file, ms_file_type, process_count=process_count) @@ -157,18 +193,39 @@ def get_peaks(self, spec_idx: int, **kwargs): def _match_one_psm( self, - spec_mzs: np.ndarray, - spec_intens: np.ndarray, + peak_mzs: np.ndarray, + peak_intens: np.ndarray, fragment_mz_df: pd.DataFrame, matched_intensity_df: pd.DataFrame, matched_mz_err_df: pd.DataFrame, frag_start_idx: int, frag_stop_idx: int, ): - if len(spec_mzs) == 0: + """ + Match fragments of one precursor (located by `frag_start_idx` and `frag_stop_idx`) + against the corresponding `peak_mzs`. + + Parameters + ---------- + peak_mzs : np.ndarray + Peak m/z values to be matched. + peak_intens : np.ndarray + Peak intensities to be matched. + fragment_mz_df : pd.DataFrame + fragment m/z dataframe to be matched. + matched_intensity_df : pd.DataFrame + The dataframe to store matched intensity values. + matched_mz_err_df : pd.DataFrame + The dataframe to store matched mz error values. + frag_start_idx : int + fragment start index of the given PSM. + frag_stop_idx : int + fragment stop index of the given PSM. 
+ """ + if len(peak_mzs) == 0: return - spec_mzs = spec_mzs.astype(PEAK_MZ_DTYPE) + peak_mzs = peak_mzs.astype(PEAK_MZ_DTYPE) frag_mzs = fragment_mz_df.values[frag_start_idx:frag_stop_idx, :] @@ -179,20 +236,20 @@ def _match_one_psm( if self.match_closest: matched_idxes = match_closest_peaks( - spec_mzs, spec_intens, frag_mzs, mz_tols + peak_mzs, peak_intens, frag_mzs, mz_tols ) else: matched_idxes = match_highest_peaks( - spec_mzs, - spec_intens, + peak_mzs, + peak_intens, frag_mzs, mz_tols, ) - matched_intens = spec_intens[matched_idxes] + matched_intens = peak_intens[matched_idxes] matched_intens[matched_idxes == -1] = 0 - matched_mz_errs = np.abs(spec_mzs[matched_idxes] - frag_mzs) + matched_mz_errs = np.abs(peak_mzs[matched_idxes] - frag_mzs) matched_mz_errs[matched_idxes == -1] = np.inf matched_intensity_df.values[frag_start_idx:frag_stop_idx, :] = matched_intens @@ -203,7 +260,7 @@ def match_ms2_one_raw( self, psm_df_one_raw: pd.DataFrame, verbose: bool = False, - ) -> tuple: + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Matching psm_df_one_raw against self.raw_data after `self.load_ms_data()` @@ -216,7 +273,7 @@ def match_ms2_one_raw( Returns ------- - tuple: + Tuple: pd.DataFrame: psm dataframe with fragment index information. pd.DataFrame: fragment mz dataframe. @@ -224,10 +281,8 @@ def match_ms2_one_raw( pd.DataFrame: matched intensity dataframe. pd.DataFrame: matched mass error dataframe. - np.inf if a fragment is not matched. - + np.inf if a fragment is not matched. 
""" - self._preprocess_psms(psm_df_one_raw) self.psm_df = psm_df_one_raw psm_df_one_raw = self._add_missing_columns_to_psm_df( @@ -307,8 +362,9 @@ def match_ms2_multi_raw( ms_files: Union[dict, list], ms_file_type: str = "alpharaw_hdf", process_num: int = 1, - ): - """Matching PSM dataframe against the ms2 files in ms_files + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Matching PSM dataframe against the ms2 files in ms_files This method will store matched values as attributes: - self.psm_df - self.fragment_mz_df @@ -330,7 +386,7 @@ def match_ms2_multi_raw( Returns ------- - tuple: + Tuple: pd.DataFrame: psm dataframe with fragment index information. pd.DataFrame: fragment mz dataframe. @@ -338,10 +394,9 @@ def match_ms2_multi_raw( pd.DataFrame: matched intensity dataframe. pd.DataFrame: matched mass error dataframe. - np.inf if a fragment is not matched. + np.inf if a fragment is not matched. """ - self._preprocess_psms(psm_df) self.psm_df = psm_df ( @@ -425,7 +480,9 @@ def _prepare_matching_dfs(self): return (fragment_mz_df, matched_intensity_df, matched_mz_err_df) - def _match_ms2_one_raw_numba(self, raw_name, psm_df_one_raw): + def _match_ms2_one_raw_numba( + self, raw_name: str, psm_df_one_raw: pd.DataFrame + ) -> pd.DataFrame: psm_df_one_raw = psm_df_one_raw.reset_index(drop=True) if raw_name in self._ms_file_dict: @@ -485,7 +542,7 @@ def match_ms2_multi_raw( ms_files: Tuple[dict, list], ms_file_type: str = "alpharaw_hdf", process_num: int = 8, - ): + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: if isinstance(ms_files, list): ms_files = parse_ms_files_to_dict(ms_files) psm_df = psm_df[psm_df.raw_name.isin(ms_files)].reset_index(drop=True) @@ -501,18 +558,18 @@ def match_ms2_multi_raw( @numba.jit(nogil=True) def match_one_raw_with_numba( - spec_idxes, - frag_start_idxes, - frag_stop_idxes, - all_frag_mzs, - all_frag_mz_tols, - all_spec_mzs, - all_spec_intensities, - peak_start_idxes, - peak_stop_idxes, - 
matched_intensities, - matched_mz_errs, - match_closest=True, + spec_idxes: np.ndarray, + frag_start_idxes: np.ndarray, + frag_stop_idxes: np.ndarray, + all_frag_mzs: np.ndarray, + all_frag_mz_tols: np.ndarray, + all_spec_mzs: np.ndarray, + all_spec_intensities: np.ndarray, + peak_start_idxes: np.ndarray, + peak_stop_idxes: np.ndarray, + matched_intensities: np.ndarray, + matched_mz_errs: np.ndarray, + match_closest: bool = True, ): """ Internel function to match fragment mz values to spectrum mz values. @@ -567,7 +624,8 @@ def load_ms_data( ms_file_type: str = "alpharaw_hdf", process_count: int = 8, ) -> MSData_Base: - """Load MS files + """ + Load MS file. Parameters ---------- @@ -575,9 +633,14 @@ def load_ms_data( ms2 file path ms_file_type : str, optional - ms2 file type, could be - ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"]. - Default to 'alpharaw_hdf' + ms2 file type, can be + ["alpharaw_hdf", "thermo", "sciex", "alphapept_hdf", "mgf"]. + Default to 'alpharaw_hdf'. + + Returns + ------- + MSData_Base: + Instance of sub-class of `MSData_Base`. """ if isinstance(ms_file, MSData_Base): return ms_file @@ -600,6 +663,9 @@ def get_best_matched_intens( frag_start_idxes: np.ndarray, frag_stop_idxes: np.ndarray, ): + """ + TODO Deprecated + """ ret_intens = np.zeros( shape=matched_intensity_values.shape[1:], dtype=matched_intensity_values.dtype ) @@ -624,6 +690,9 @@ def get_ion_count_scores( frag_stop_idxes: np.ndarray, min_mz: float = 200, ): + """ + TODO Deprecated + """ scores = [] for i in range(len(frag_start_idxes)): scores.append( diff --git a/alpharaw/match/psm_match_alphatims.py b/alpharaw/match/psm_match_alphatims.py index 876ba69..d2affc4 100644 --- a/alpharaw/match/psm_match_alphatims.py +++ b/alpharaw/match/psm_match_alphatims.py @@ -1,3 +1,4 @@ +# TODO to be removed as already implemented in alphaDIA. 
from typing import Tuple, Union import numpy as np @@ -6,9 +7,6 @@ from alphatims.bruker import TimsTOF from alpharaw.ms_data_base import MSData_Base, ms_reader_provider -from alpharaw.wrappers.alphapept_wrapper import ( - AlphaPept_HDF_MS2_Reader, # noqa: F401 # TODO remove import side effect -) from alpharaw.wrappers.alphatims_wrapper import AlphaTimsWrapper from .psm_match import PepSpecMatch @@ -231,7 +229,6 @@ def match_ms2_one_raw( np.inf if a fragment is not matched. """ - self._preprocess_psms(psm_df_one_raw) self.psm_df = psm_df_one_raw psm_df_one_raw = self._add_missing_columns_to_psm_df(psm_df_one_raw) From d10fa5e52b31f7b9caa01806722363501a22fcda Mon Sep 17 00:00:00 2001 From: jalew188 Date: Mon, 17 Jun 2024 12:41:28 +0200 Subject: [PATCH 2/4] #53 FIX docs for psm_match.py --- alpharaw/match/psm_match.py | 113 ++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 31 deletions(-) diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py index d544389..814dfd2 100644 --- a/alpharaw/match/psm_match.py +++ b/alpharaw/match/psm_match.py @@ -30,25 +30,8 @@ class PepSpecMatch: """ Extract fragment ions from MS2 data. - - Parameters - ---------- - charged_frag_types : list, optional - fragment types with charge states, - e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2']. - If None, it is `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)`. - By default None. - - match_closest : bool, optional - if True, match the closest peak for a m/z; - if False, matched the higest peak for a m/z in the tolerance range. - By default True. - - use_ppm : bool, optional - If use ppm, by default True. - - tol_value : float, optional - tolerance value, by default 20.0 + The extracted information can be used for visualization of peak annotation or + PeptDeep transfer learning for the MS2 model. 
""" match_closest: bool = True @@ -64,6 +47,26 @@ def __init__( use_ppm: bool = True, tol_value: float = 20.0, ): + """ + Parameters + ---------- + charged_frag_types : list, optional + fragment types with charge states, + e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2']. + Defaults to `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)`. + + match_closest : bool, optional + if True, match the closest peak for a m/z; + if False, match the highest peak for a m/z in the tolerance range. + By default True. + + use_ppm : bool, optional + If True, use ppm; otherwise use Da. By default True. + + tol_value : float, optional + Matching tolerance value (ppm or Da based on `use_ppm`) + for peak annotation, by default 20.0 + """ self.charged_frag_types = ( get_charged_frag_types(["b", "y", "b_modloss", "y_modloss"], 2) if charged_frag_types is None @@ -82,7 +85,7 @@ def get_fragment_mz_df(self) -> pd.DataFrame: Returns ------- DataFrame - _description_ + The fragment m/z dataframe in alphabase format. """ return create_fragment_mz_dataframe( self.psm_df, @@ -94,7 +97,7 @@ def _add_missing_columns_to_psm_df( self, psm_df: pd.DataFrame, raw_data: MSData_Base = None ): """ - Add missing "rt", "nce", "rt_norm", ("mobility") columns to `psm_df` if missing. + Add missing "rt", "nce", "rt_norm", ("mobility") columns to `psm_df` inplace if missing. Parameters ---------- @@ -106,7 +109,7 @@ def _add_missing_columns_to_psm_df( Returns ------- DataFrame - psm_df inplace. + The original `psm_df` with missing columns added. """ if raw_data is None: raw_data = self.raw_data @@ -179,7 +182,7 @@ def load_ms_data( Parameters ---------- ms_file : str | MSData_Base - ms2 file path. + Absolute or relative path of the ms2 file. 
ms_file_type : str, optional ms2 file type, could be @@ -194,7 +197,7 @@ def get_peaks(self, spec_idx: int, **kwargs): def _match_one_psm( self, peak_mzs: np.ndarray, - peak_intens: np.ndarray, + peak_intensities: np.ndarray, fragment_mz_df: pd.DataFrame, matched_intensity_df: pd.DataFrame, matched_mz_err_df: pd.DataFrame, @@ -209,7 +212,7 @@ def _match_one_psm( ---------- peak_mzs : np.ndarray Peak m/z values to be matched. - peak_intens : np.ndarray + peak_intensities : np.ndarray Peak intensities to be matched. fragment_mz_df : pd.DataFrame fragment m/z dataframe to be matched. @@ -236,17 +239,17 @@ def _match_one_psm( if self.match_closest: matched_idxes = match_closest_peaks( - peak_mzs, peak_intens, frag_mzs, mz_tols + peak_mzs, peak_intensities, frag_mzs, mz_tols ) else: matched_idxes = match_highest_peaks( peak_mzs, - peak_intens, + peak_intensities, frag_mzs, mz_tols, ) - matched_intens = peak_intens[matched_idxes] + matched_intens = peak_intensities[matched_idxes] matched_intens[matched_idxes == -1] = 0 matched_mz_errs = np.abs(peak_mzs[matched_idxes] - frag_mzs) @@ -442,6 +445,9 @@ def match_ms2_multi_raw( class PepSpecMatch_DIA(PepSpecMatch): + """ + Peak annotation for DIA data. + """ max_spec_per_query: int = 3 min_frag_mz: float = 200.0 @@ -483,6 +489,22 @@ def _prepare_matching_dfs(self): def _match_ms2_one_raw_numba( self, raw_name: str, psm_df_one_raw: pd.DataFrame ) -> pd.DataFrame: + """ + Internal method to extract peak information with numba as backend. + + Parameters + ---------- + raw_name : str + The raw name of the raw file. `psm_df_one_raw` dataframe should also + contain the same raw name in `raw_name` column. + psm_df_one_raw : pd.DataFrame + The dataframe for PSMs. 
+ + Returns + ------- + pd.DataFrame + `psm_df_one_raw` + """ psm_df_one_raw = psm_df_one_raw.reset_index(drop=True) if raw_name in self._ms_file_dict: @@ -542,7 +564,36 @@ def match_ms2_multi_raw( ms_files: Tuple[dict, list], ms_file_type: str = "alpharaw_hdf", process_num: int = 8, - ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Match peaks for the given `psm_df` against the corresponding MS spectrum files. + + Parameters + ---------- + psm_df : pd.DataFrame + Peptide-spectrum matches in alphabase dataframe format. + ms_files : Tuple[dict, list] + The absolute or relative paths of MS files. + if the type is `dict`, the format will be + `{'raw_name1': 'raw_name1.raw', ...}` if `ms_file_type` is `thermo_raw`. + ms_file_type : str, optional + MS file type that is already registered in + :obj:`alpharaw.ms_data_base.ms_reader_provider`. + By default "alpharaw_hdf". + process_num : int, optional + Match peaks by using multiprocessing, by default 8 + + Returns + ------- + Tuple + pd.DataFrame: the `psm_df`. + + pd.DataFrame: fragment m/z dataframe in alphabase format. + + pd.DataFrame: the matched fragment intensity dataframe in alphabase format. + + pd.DataFrame: the matched mass error in the same dataframe format. + """ if isinstance(ms_files, list): ms_files = parse_ms_files_to_dict(ms_files) psm_df = psm_df[psm_df.raw_name.isin(ms_files)].reset_index(drop=True) @@ -570,7 +621,7 @@ def match_one_raw_with_numba( matched_intensities: np.ndarray, matched_mz_errs: np.ndarray, match_closest: bool = True, -): +)->None: """ Internel function to match fragment mz values to spectrum mz values. Matched_mz_errs[i] = np.inf if no peaks are matched. @@ -625,7 +676,7 @@ def load_ms_data( process_count: int = 8, ) -> MSData_Base: """ - Load MS file. + Load MS file and get `MSData_Base` object. 
Parameters ---------- From d243b641820e0c7683f9a63b11160bd4dead4017 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Mon, 17 Jun 2024 12:42:50 +0200 Subject: [PATCH 3/4] #53 FIX pre-commit --- alpharaw/match/psm_match.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py index 814dfd2..e058f27 100644 --- a/alpharaw/match/psm_match.py +++ b/alpharaw/match/psm_match.py @@ -448,6 +448,7 @@ class PepSpecMatch_DIA(PepSpecMatch): """ Peak annotation for DIA data. """ + max_spec_per_query: int = 3 min_frag_mz: float = 200.0 @@ -621,7 +622,7 @@ def match_one_raw_with_numba( matched_intensities: np.ndarray, matched_mz_errs: np.ndarray, match_closest: bool = True, -)->None: +) -> None: """ Internel function to match fragment mz values to spectrum mz values. Matched_mz_errs[i] = np.inf if no peaks are matched. From 5a9ce6be54e76f3aca02ffe00c71d2404901c962 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 18 Jun 2024 13:10:50 +0200 Subject: [PATCH 4/4] #53 FIX add more docs in psm_match.py --- alpharaw/match/psm_match.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py index e058f27..0f5572c 100644 --- a/alpharaw/match/psm_match.py +++ b/alpharaw/match/psm_match.py @@ -146,7 +146,10 @@ def _add_missing_columns_to_psm_df( def _prepare_matching_dfs(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ - Prepare dataframes to be matched. + Prepare empty `fragment_mz_df`, `matched_intensity_df`, + and `matched_mz_err_df` dataframes to extract peak matching information + for `self.psm_df`. These three dataframes will be only used internally + in :class:`PepSpecMatch` objects. Returns -------