Skip to content

Commit

Permalink
merging last version of development to xl/feature and fix conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
mostafakalhor committed Oct 15, 2024
1 parent 7a3a676 commit cad7c14
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 25 deletions.
43 changes: 29 additions & 14 deletions oktoberfest/predict/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,31 @@
logger = logging.getLogger(__name__)


def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_charge: bool = False) -> Spectra:
def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_charge: bool = False, xl: bool = False) -> Spectra:
"""
Prepare an alignment DataFrame from the given Spectra library.
This function creates an alignment DataFrame by removing decoys and non-HCD-fragmented spectra
from the input library, selecting the top 1000 (or however many are available if <1000) highest-scoring spectra, and
repeating the DataFrame for each collision energy (CE) in the given range.
This function creates an alignment DataFrame by removing decoys and non-HCD-fragmented spectra
from the input library, selecting the top 1000 highest-scoring spectra for linear peptides or the top 20 for
cross-linked peptides, and repeating the DataFrame for each collision energy (CE) in the given range.
:param library: the library to be propagated
:param ce_range: the min and max CE to be propagated for alignment in the dataframe
:param group_by_charge: if true, select the top 1000 spectra independently for each precursor charge
:param xl: if true, select the top 20 spectra (cross-linked peptides) instead of the top 1000
:return: a library that is modified according to the description above
"""
top_n = 1000
top_n = 1000 if not xl else 20

if group_by_charge:
groups = ["RAW_FILE", "PRECURSOR_CHARGE"]
else:
groups = ["RAW_FILE"]

hcd_targets = library.obs.query("(FRAGMENTATION == 'HCD') & ~REVERSE")
hcd_targets = hcd_targets.sort_values(by="SCORE", ascending=False).groupby(groups)
hcd_targets = hcd_targets.sort_values(by="SCORE", ascending=False).groupby(groups
top_hcd_targets = hcd_targets.head(top_n)

alignment_library = library[top_hcd_targets.index]
alignment_library = Spectra(
anndata.concat([alignment_library for _ in range(*ce_range)], index_unique="_", keys=range(*ce_range))
Expand All @@ -41,14 +42,14 @@ def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_
alignment_library.obs.reset_index(inplace=True)

alignment_library.obs["ORIG_COLLISION_ENERGY"] = alignment_library.obs["COLLISION_ENERGY"]
alignment_library.obs["COLLISION_ENERGY"] = np.repeat(range(*ce_range), len(top_hcd_targets))

alignment_library.obs["COLLISION_ENERGY"] = np.repeat(range(*ce_range), len(top_hcd_targets)
# alignment_library.uns["ion_types"] = library.uns["ion_types"]

return alignment_library


def _alignment(alignment_library: Spectra, xl: bool = False):
    """
    Perform the alignment of predicted versus raw intensities.

    The spectral angle between the raw and the predicted intensity matrix of the library is computed
    and stored in the "SPECTRAL_ANGLE" column. For cross-linked peptides, the spectral angle is computed
    separately on the A and B matrices, stored as "SPECTRAL_ANGLE_A" and "SPECTRAL_ANGLE_B", and
    "SPECTRAL_ANGLE" is set to the mean of the two.

    :param alignment_library: the library to perform the alignment on; it is modified in place
    :param xl: if true, align the PRED_A/RAW_A and PRED_B/RAW_B matrices instead of PRED/RAW
    """
    if xl:
        pred_intensity_a = alignment_library.get_matrix(FragmentType.PRED_A)
        pred_intensity_b = alignment_library.get_matrix(FragmentType.PRED_B)
        raw_intensity_a = alignment_library.get_matrix(FragmentType.RAW_A)
        raw_intensity_b = alignment_library.get_matrix(FragmentType.RAW_B)
        sm_a = SimilarityMetrics(pred_intensity_a, raw_intensity_a)
        sm_b = SimilarityMetrics(pred_intensity_b, raw_intensity_b)
        alignment_library.add_column(sm_a.spectral_angle(raw_intensity_a, pred_intensity_a, 0), "SPECTRAL_ANGLE_A")
        alignment_library.add_column(sm_b.spectral_angle(raw_intensity_b, pred_intensity_b, 0), "SPECTRAL_ANGLE_B")
        # The combined score is the unweighted mean of the two per-peptide spectral angles.
        alignment_library.add_column(
            (alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2,
            "SPECTRAL_ANGLE",
        )
    else:
        pred_intensity = alignment_library.get_matrix(FragmentType.PRED)
        raw_intensity = alignment_library.get_matrix(FragmentType.RAW)
        sm = SimilarityMetrics(pred_intensity, raw_intensity)
        alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE")
31 changes: 20 additions & 11 deletions oktoberfest/predict/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def _filter_kwargs(self, **kwargs) -> dict[str, Any]:
signature = inspect.signature(self._predictor.predict)
return {key: value for key, value in kwargs.items() if key in signature.parameters}

def predict_intensities(self, data: Spectra, xl: bool = False, chunk_idx: Optional[list[pd.Index]] = None, **kwargs):
    """
    Generate intensity predictions and add them to the provided data object.

    Without chunk indices, all predictions are retrieved in one go and added to the data object.
    With chunk indices, predictions are retrieved chunk by chunk and added as a list of predicted
    intensities.

    :param data: the Spectra object to predict for; predictions are added to it in place
    :param xl: if true, request predictions for cross-linked peptides
    :param chunk_idx: optional list of index chunks; if None, predict everything at once
    :param kwargs: additional keyword arguments forwarded to the prediction method
    """
    if chunk_idx is None:
        # `xl` and `**kwargs` are separate arguments; without the comma this would be parsed as
        # the power expression `xl ** kwargs` and raise a TypeError.
        intensities = self.predict_at_once(data=data, xl=xl, **kwargs)
        # NOTE(review): results are stored as FragmentType.PRED even when xl is true, whereas
        # _alignment reads PRED_A / PRED_B for cross-linked peptides - confirm where the xl
        # predictions are expected to end up.
        data.add_intensities(intensities["intensities"], intensities["annotation"], fragment_type=FragmentType.PRED)
    else:
        chunked_intensities = self.predict_in_chunks(data=data, chunk_idx=chunk_idx, xl=xl, **kwargs)
        data.add_list_of_predicted_intensities(
            chunked_intensities["intensities"], chunked_intensities["annotation"], chunk_idx
        )
Expand Down Expand Up @@ -193,7 +193,7 @@ def predict_rt(self, data: Spectra, **kwargs):
pred_irts = self.predict_at_once(data=data, **kwargs)
data.add_column(pred_irts["irt"].squeeze(), name="PREDICTED_IRT")

def predict_at_once(self, data: Spectra, xl: bool = False, **kwargs) -> dict[str, np.ndarray]:
    """
    Retrieve and return predictions in one go.

    The keyword arguments are filtered with `_filter_kwargs` before they are handed to the
    underlying predictor.

    :param data: the Spectra object to predict for
    :param xl: if true, call the predictor's `predict_xl` method instead of `predict`
    :param kwargs: additional keyword arguments; only those accepted by the filter are forwarded
    :return: the dictionary returned by the underlying predictor
    """
    # NOTE(review): _filter_kwargs inspects the signature of `predict`, so keyword arguments that
    # only `predict_xl` accepts would be dropped here - confirm this is intended.
    predict = self._predictor.predict_xl if xl else self._predictor.predict
    return predict(data, **self._filter_kwargs(**kwargs))



def _predict_at_once_df(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray]:
    """
    Retrieve and return predictions in one go for data given as a DataFrame.

    :param data: the DataFrame handed to the underlying predictor
    :param kwargs: additional keyword arguments; they are filtered with `_filter_kwargs` first
    :return: the dictionary returned by the underlying predictor
    """
    accepted_kwargs = self._filter_kwargs(**kwargs)
    return self._predictor.predict(data, **accepted_kwargs)

def predict_in_chunks(
    self, data: Spectra, chunk_idx: list[pd.Index], xl: bool = False, **kwargs
) -> dict[str, list[np.ndarray]]:
    """
    Retrieve and return predictions in chunks.

    The data is indexed with each entry of `chunk_idx` in turn and a prediction is requested per chunk.
    The per-chunk result dictionaries are then regrouped into one dictionary that maps each key of
    the first result to the list of per-chunk values, in chunk order.

    :param data: the Spectra object to predict for
    :param chunk_idx: the index chunks used to subset the data
    :param xl: if true, call the predictor's `predict_xl` method instead of `predict`
    :param kwargs: additional keyword arguments; only those accepted by `_filter_kwargs` are forwarded
    :raises IndexError: if `chunk_idx` is empty, because there is no first result to take keys from
    :return: a dictionary mapping each prediction key to a list with one entry per chunk
    """
    # Neither the predictor method nor the accepted keyword arguments depend on the chunk, so both
    # are determined once instead of once per chunk.
    predict = self._predictor.predict_xl if xl else self._predictor.predict
    accepted_kwargs = self._filter_kwargs(**kwargs)

    results = [predict(data[idx], **accepted_kwargs) for idx in chunk_idx]
    ret_val = {key: [item[key] for item in results] for key in results[0].keys()}
    return ret_val

def ce_calibration(
    self, library: Spectra, ce_range: tuple[int, int], group_by_charge: bool, xl: bool = False, **kwargs
) -> Spectra:
    """
    Calculate best collision energy for peptide property predictions.

    An alignment library is prepared from the given library for the given CE range, intensities are
    predicted for it, and the predictions are aligned against the raw intensities. The alignment
    library carrying the results is returned.

    :param library: the library to derive the alignment library from
    :param ce_range: the CE range handed to the preparation of the alignment library
    :param group_by_charge: handed to the preparation of the alignment library
    :param xl: if true, prepare, predict and align for cross-linked peptides
    :param kwargs: additional keyword arguments forwarded to `predict_intensities`
    :return: the alignment library with predictions and alignment results added
    """
    alignment_library = _prepare_alignment_df(library, ce_range=ce_range, group_by_charge=group_by_charge, xl=xl)

    # Models whose name contains "alphapept" are predicted in chunks grouped by peptide length;
    # all other models are predicted in one go.
    if "alphapept" in self.model_name.lower():
        chunk_idx = list(group_iterator(df=alignment_library.obs, group_by_column="PEPTIDE_LENGTH"))
    else:
        chunk_idx = None
    # `xl` and `**kwargs` are separate arguments; without the comma this would be parsed as the
    # power expression `xl ** kwargs` and raise a TypeError.
    self.predict_intensities(data=alignment_library, chunk_idx=chunk_idx, keep_dataset=False, xl=xl, **kwargs)
    _alignment(alignment_library, xl=xl)
    return alignment_library

0 comments on commit cad7c14

Please sign in to comment.