Skip to content

Commit

Permalink
fixing conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
mostafakalhor committed Oct 16, 2024
1 parent cad7c14 commit 97c00f0
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 63 deletions.
30 changes: 29 additions & 1 deletion oktoberfest/data/spectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,14 +230,42 @@ def add_intensities(self, intensities: np.ndarray, annotation: np.ndarray, fragm
intensities[intensities == 0] = c.EPSILON
intensities[intensities == -1] = 0.0

annotation_cleaned = np.array([s.decode('utf-8') if isinstance(s, bytes) else str(s) for s in annotation[0]])
annotation_cleaned = np.where(annotation_cleaned == 'None', 'no_fragment', annotation_cleaned)
annotation_cleaned = np.where(annotation_cleaned == None, 'no_fragment', annotation_cleaned)


annotation_to_index = {annot: index for index, annot in enumerate(self.var_names)}
col_index = np.vectorize(annotation_to_index.get)(annotation[0].astype(str))
print(annotation_to_index)
print("Original Annotation:", annotation[0])
print("Cleaned Annotation:", annotation_cleaned)

col_index = np.vectorize(annotation_to_index.get)(annotation_cleaned.astype(str))


sparse_intensity_matrix = dok_matrix(self.shape)
sparse_intensity_matrix[:, col_index] = intensities

layer = self._resolve_layer_name(fragment_type)
self.layers[layer] = csr_matrix(sparse_intensity_matrix)

def add_intensities_without_mapping(self, intensities: np.ndarray, fragment_type: FragmentType):
    """
    Add predicted intensities and convert to sparse matrix.

    This function takes a numpy array containing intensities.
    The intensity array is expected to have the same shape as this object and will be added to
    the respective layer without checking the order of fragment annotations.

    :param intensities: intensity numpy array to add with shapes (n x m)
    :param fragment_type: the type of intensities to add. Can be FragmentType.RAW or FragmentType.PRED.
    """
    # NOTE(review): presumably 0 encodes "no measured signal" and must stay distinguishable
    # from true zeros once stored sparsely, hence the epsilon substitution — confirm.
    intensities[intensities == 0] = c.EPSILON
    # NOTE(review): -1 appears to mark invalid/masked positions; mapping them to 0.0 drops
    # them from the sparse representation — confirm against add_intensities.
    intensities[intensities == -1] = 0.0

    # Store under the layer name matching the fragment type (e.g. raw vs. predicted).
    layer = self._resolve_layer_name(fragment_type)
    self.layers[layer] = csr_matrix(intensities)

def add_list_of_predicted_intensities(
self,
intensities: list[np.ndarray],
Expand Down
4 changes: 2 additions & 2 deletions oktoberfest/predict/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_
groups = ["RAW_FILE"]

hcd_targets = library.obs.query("(FRAGMENTATION == 'HCD') & ~REVERSE")
hcd_targets = hcd_targets.sort_values(by="SCORE", ascending=False).groupby(groups
hcd_targets = hcd_targets.sort_values(by="SCORE", ascending=False).groupby(groups)
top_hcd_targets = hcd_targets.head(top_n)

alignment_library = library[top_hcd_targets.index]
Expand All @@ -42,7 +42,7 @@ def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_
alignment_library.obs.reset_index(inplace=True)

alignment_library.obs["ORIG_COLLISION_ENERGY"] = alignment_library.obs["COLLISION_ENERGY"]
alignment_library.obs["COLLISION_ENERGY"] = np.repeat(range(*ce_range), len(top_hcd_targets)
alignment_library.obs["COLLISION_ENERGY"] = np.repeat(range(*ce_range), len(top_hcd_targets))

# alignment_library.uns["ion_types"] = library.uns["ion_types"]

Expand Down
48 changes: 19 additions & 29 deletions oktoberfest/predict/koina.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from __future__ import annotations

import copy
import logging
from typing import TYPE_CHECKING

Expand All @@ -12,6 +12,7 @@

if TYPE_CHECKING:
import numpy as np
from typing import Tuple, Dict


alternative_column_map = {
Expand Down Expand Up @@ -81,17 +82,9 @@ def predict(self, data: dict[str, np.ndarray] | pd.DataFrame | Spectra, **kwargs
input_field: data[[alternative_column_map[input_field]]].to_numpy()
for input_field in self.model_inputs.keys()
}
if _async:
return self.__predict_async(data, debug=debug)
else:
return self.__predict_sequential(data)

def predict_xl(
self,
data: Union[Dict[str, np.ndarray], pd.DataFrame],
_async: bool = True,
debug=False,
) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
return super().predict(inputs=data, **kwargs)

def predict_xl(self, data: dict[str, np.ndarray] | pd.DataFrame | Spectra, **kwargs) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
"""
Perform inference on the xl data using the Koina model.
Expand All @@ -104,13 +97,11 @@ def predict_xl(
:param data: A dictionary or dataframe containing input data for inference. For the dictionary, keys are input names,
and values are numpy arrays. In case of a dataframe, the input fields for the requested model must be present
in the column names.
:param _async: If True, perform asynchronous inference; if False, perform sequential inference. Defaults to True.
:param debug: If True and using _async mode, store raw InferResult / InferServerException dictionary for later analysis.
:param kwargs: Additional params that are forwarded to super().predict
:return: A dictionary containing the model's predictions. Keys are output names, and values are numpy arrays
representing the model's output.
:return: TODO
Example::
Example:: TODO
model = Koina("Prosit_2019_intensity")
input_data = {
"peptide_sequences": np.array(["PEPTIDEK" for _ in range(size)]),
Expand All @@ -121,18 +112,17 @@ def predict_xl(
}
predictions = model.predict(input_data)
"""
if isinstance(data, Spectra):
data = data.obs
if isinstance(data, pd.DataFrame):
data_1 = {
input_field: data[alternative_column_map_xl[input_field]].to_numpy()
for input_field in self.model_inputs.keys()
}
data_2 = {
input_field: data[alternative_column_map_xl_switched[input_field]].to_numpy()
data = {
input_field: data[[alternative_column_map_xl[input_field]]].to_numpy()
for input_field in self.model_inputs.keys()
}
if _async:
return self.__predict_async(data_1, debug=debug), self.__predict_async(data_2, debug=debug)
else:
return self.__predict_sequential(data_1), self.__predict_sequential(data_2)


prediction_ab = super().predict(inputs=data, debug = True, **kwargs)
temp_field = data["peptide_sequences_1"].copy()
data["peptide_sequences_1"] = data["peptide_sequences_2"]
data["peptide_sequences_2"] = temp_field
prediction_ba = super().predict(inputs=data, debug = True, **kwargs)

return prediction_ab, prediction_ba
28 changes: 23 additions & 5 deletions oktoberfest/predict/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,31 @@ def predict_intensities(self, data: Spectra, xl: bool = False, chunk_idx: Option
>>> print(library.layers["pred_int"])
"""
if chunk_idx is None:
intensities = self.predict_at_once(data=data, xl=xl **kwargs)
data.add_intensities(intensities["intensities"], intensities["annotation"], fragment_type=FragmentType.PRED)
if xl:
intensities_a, intensities_b = self.predict_at_once(data=data, xl=xl, **kwargs)
#print(intensities_a["annotation"][0])
data.add_intensities_without_mapping(intensities_a["intensities"], fragment_type=FragmentType.PRED_A)
data.add_intensities_without_mapping(intensities_b["intensities"], fragment_type=FragmentType.PRED_B)
else:
intensities = self.predict_at_once(data=data, xl=xl, **kwargs)
data.add_intensities(intensities["intensities"], intensities["annotation"], fragment_type=FragmentType.PRED)

else:
chunked_intensities = self.predict_in_chunks(data=data, chunk_idx=chunk_idx, xl=xl, **kwargs)
data.add_list_of_predicted_intensities(
if xl:
chunked_intensities_a, chunked_intensities_b = self.predict_in_chunks(data=data, chunk_idx=chunk_idx, xl=xl, **kwargs)
data.add_list_of_predicted_intensities(
chunked_intensities_a["intensities"], chunked_intensities_a["annotation"], chunk_idx
)
data.add_list_of_predicted_intensities(
chunked_intensities_b["intensities"], chunked_intensities_b["annotation"], chunk_idx
)
else:
chunked_intensities = self.predict_in_chunks(data=data, chunk_idx=chunk_idx, xl=xl, **kwargs)
data.add_list_of_predicted_intensities(
chunked_intensities["intensities"], chunked_intensities["annotation"], chunk_idx
)


def predict_rt(self, data: Spectra, **kwargs):
"""
Generate retention time predictions and add them to the provided data object.
Expand Down Expand Up @@ -371,6 +388,7 @@ def ce_calibration(self, library: Spectra, ce_range: tuple[int, int], group_by_c
chunk_idx = list(group_iterator(df=alignment_library.obs, group_by_column="PEPTIDE_LENGTH"))
else:
chunk_idx = None
self.predict_intensities(data=alignment_library, chunk_idx=chunk_idx, keep_dataset=False, xl=xl **kwargs)
print("XL!!!!!!!!!!!!!!!!!!!!")
self.predict_intensities(data=alignment_library, chunk_idx=chunk_idx, keep_dataset=False, xl=xl, **kwargs)
_alignment(alignment_library, xl=xl)
return alignment_library
20 changes: 16 additions & 4 deletions oktoberfest/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,18 +531,25 @@ def convert_search(
search_result: Any
if search_engine == "maxquant":
search_result = MaxQuant
xl = False
elif search_engine == "msfragger":
search_result = MSFragger
xl = False
elif search_engine == "mascot":
search_result = Mascot
xl = False
elif search_engine == "sage":
search_result = Sage
xl = False
elif search_engine == "xisearch":
search_result = Xisearch
xl = True
elif search_engine == "scout":
search_result = Scout
xl = True
elif search_engine == "msamanda":
search_result = MSAmanda
xl = False
else:
raise ValueError(f"Unknown search engine provided: {search_engine}")

Expand All @@ -552,6 +559,7 @@ def convert_search(
custom_mods=custom_mods,
ptm_unimod_id=ptm_unimod_id,
ptm_sites=ptm_sites,
xl=xl,
)


Expand Down Expand Up @@ -978,10 +986,14 @@ def annotate_spectral_library_xl(psms: Spectra, mass_tol: Optional[float] = None
logger.info("Annotating spectra...")
df_annotated_spectra = annotate_spectra(psms, mass_tol, unit_mass_tol)
aspec = Spectra(obs=psms.drop(columns=["INTENSITIES", "MZ"]), var=Spectra._gen_vars_df(xl=True))
aspec.add_matrix(np.stack(df_annotated_spectra["INTENSITIES_A"]), FragmentType.RAW_A)
aspec.add_matrix(np.stack(df_annotated_spectra["INTENSITIES_B"]), FragmentType.RAW_B)
aspec.add_matrix(np.stack(df_annotated_spectra["MZ_A"]), FragmentType.MZ_A)
aspec.add_matrix(np.stack(df_annotated_spectra["MZ_B"]), FragmentType.MZ_B)
aspec.add_intensities(
np.stack(df_annotated_spectra["INTENSITIES_A"]), aspec.var_names.values[None, ...], FragmentType.RAW_A
)
aspec.add_intensities(
np.stack(df_annotated_spectra["INTENSITIES_B"]), aspec.var_names.values[None, ...], FragmentType.RAW_B
)
aspec.add_mzs(np.stack(df_annotated_spectra["MZ_A"]), FragmentType.MZ_A)
aspec.add_mzs(np.stack(df_annotated_spectra["MZ_B"]), FragmentType.MZ_B)
aspec.add_column(df_annotated_spectra["CALCULATED_MASS_A"].values, "CALCULATED_MASS_A")
aspec.add_column(df_annotated_spectra["CALCULATED_MASS_B"].values, "CALCULATED_MASS_B")

Expand Down
2 changes: 1 addition & 1 deletion oktoberfest/rescore/rescore.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def generate_features(
if xl:
pred_a = library.get_matrix(FragmentType.PRED_A)
pred_b = library.get_matrix(FragmentType.PRED_B)
raw_a = library.get_matrix(FragmentType.RAW_A)[
raw_a = library.get_matrix(FragmentType.RAW_A)
raw_b = library.get_matrix(FragmentType.RAW_B)
mz_a = library.get_matrix(FragmentType.MZ_A)
mz_b = library.get_matrix(FragmentType.MZ_B)
Expand Down
32 changes: 11 additions & 21 deletions oktoberfest/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,18 +183,9 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config):


if "xl" in config.models["intensity"].lower():
alignment_library = pr.ce_calibration(library, config.ce_range, use_ransac_model, xl=True, dataset_name=spectra_file.stem + "_ce_calibration",)
alignment_library = predictor.ce_calibration(library, config.ce_range, use_ransac_model, xl=True, dataset_name=spectra_file.stem + "_ce_calibration",)
else:
alignment_library = pr.ce_calibration(library, config.ce_range, use_ransac_model, dataset_name=spectra_file.stem + "_ce_calibration",)


predictor = pr.Predictor.from_config(config, model_type="intensity")
alignment_library = predictor.ce_calibration(
library,
config.ce_range,
use_ransac_model,
dataset_name=spectra_file.stem + "_ce_calibration",
)
alignment_library = predictor.ce_calibration(library, config.ce_range, use_ransac_model, dataset_name=spectra_file.stem + "_ce_calibration",)

if use_ransac_model:
logger.info("Performing RANSAC regression")
Expand Down Expand Up @@ -602,19 +593,19 @@ def _calculate_features(spectra_file: Path, config: Config, xl: bool = False):
)
else:
intensity_predictor = pr.Predictor.from_config(config, model_type="intensity")

if xl:
print("XL!!!!!!!!!!!!!!!!!!!!CACLFEATURE")
intensity_predictor.predict_intensities(
data=library, xl=True, chunk_idx=chunk_idx, dataset_name=spectra_file.stem, keep_dataset=False
)
data=library, xl=True, chunk_idx=chunk_idx, dataset_name=spectra_file.stem, keep_dataset=False
)

library.write_as_hdf5(config.output / "data" / spectra_file.with_suffix(".mzml.pred.hdf5").name)
predict_step.mark_done()

else:
intensity_predictor.predict_intensities(
data=library, chunk_idx=chunk_idx, dataset_name=spectra_file.stem, keep_dataset=False
)
data=library, chunk_idx=chunk_idx, dataset_name=spectra_file.stem, keep_dataset=False
)

irt_predictor = pr.Predictor.from_config(config, model_type="irt")
irt_predictor.predict_rt(data=library)
Expand Down Expand Up @@ -668,15 +659,12 @@ def _rescore(fdr_dir: Path, config: Config, xl : bool = False):
re.rescore_with_percolator(input_file=fdr_dir / "original.tab", output_folder=fdr_dir, xl = xl)
rescore_original_step.mark_done()
if not rescore_prosit_step.is_done():
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir, xl = xl)
rescore_prosit_step.mark_done()
logger.info("Start percolator rescoring")
logger.info(config.ptm_localization)
if config.ptm_localization:
_ptm_localization_rescore(fdr_dir, config)
else:
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir)
rescore_prosit_step.mark_done()
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir, xl=xl)
rescore_prosit_step.mark_done()
elif config.fdr_estimation_method == "mokapot":
if not rescore_original_step.is_done():
re.rescore_with_mokapot(input_file=fdr_dir / "original.tab", output_folder=fdr_dir, xl = xl)
Expand Down Expand Up @@ -1114,13 +1102,15 @@ def run_rescoring(config_path: Union[str, Path]):
processing_pool = JobPool(processes=config.num_threads)
for spectra_file in spectra_files:
if "xl" in config.models["intensity"].lower():
print("XL!!!!!!!!!!!!!!!!!!!! RUNNER")
processing_pool.apply_async(_calculate_features, [spectra_file, config], xl=True)
else:
processing_pool.apply_async(_calculate_features, [spectra_file, config])
processing_pool.check_pool()
else:
for spectra_file in spectra_files:
if "xl" in config.models["intensity"].lower():
print("XL!!!!!!!!!!!!!!!!!!!!")
_calculate_features(spectra_file, config, xl=True)
else:
_calculate_features(spectra_file, config)
Expand Down

0 comments on commit 97c00f0

Please sign in to comment.