Skip to content

Commit

Permalink
cce is working based on pep a and b
Browse files Browse the repository at this point in the history
  • Loading branch information
Mostafa Kalhor committed Jul 28, 2023
1 parent f1c3562 commit 58792e3
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 48 deletions.
83 changes: 61 additions & 22 deletions oktoberfest/ce_calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,23 @@ def merge_mzml_and_msms(self, df_search: pd.DataFrame):
df_annotated_spectra = annotate_spectra(df_join)
df_join.drop(columns=["INTENSITIES", "MZ"], inplace=True)
logger.info("Preparing library")
self.library.add_columns(df_join)
self.library.add_matrix(df_annotated_spectra["INTENSITIES"], FragmentType.RAW)
self.library.add_matrix(df_annotated_spectra["MZ"], FragmentType.MZ)
self.library.add_column(df_annotated_spectra["CALCULATED_MASS"], "CALCULATED_MASS")
if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]):
self.library.add_columns(df_join)
self.library.add_matrix(df_annotated_spectra["INTENSITIES_A"], FragmentType.RAW_A)
#print("FragmentType.RAW_A")
self.library.add_matrix(df_annotated_spectra["INTENSITIES_B"], FragmentType.RAW_B)
#print("FragmentType.RAW_B")
self.library.add_matrix(df_annotated_spectra["MZ_A"], FragmentType.MZ_A)
#print("FragmentType.MZ_A")
self.library.add_matrix(df_annotated_spectra["MZ_B"], FragmentType.MZ_B)
#print("FragmentType.MZ_B")
self.library.add_column(df_annotated_spectra["CALCULATED_MASS_A"], "CALCULATED_MASS_A")
self.library.add_column(df_annotated_spectra["CALCULATED_MASS_B"], "CALCULATED_MASS_B")
else:
self.library.add_columns(df_join)
self.library.add_matrix(df_annotated_spectra["INTENSITIES"], FragmentType.RAW)
self.library.add_matrix(df_annotated_spectra["MZ"], FragmentType.MZ)
self.library.add_column(df_annotated_spectra["CALCULATED_MASS"], "CALCULATED_MASS")


def get_mzml_path(self) -> Path:
Expand All @@ -150,11 +163,16 @@ def _prepare_alignment_df(self):
(self.alignment_library.spectra_data["FRAGMENTATION"] == "HCD")
& (~self.alignment_library.spectra_data["REVERSE"])
]
# Select the 1000 highest scoring or all if there are less than 1000
self.alignment_library.spectra_data = self.alignment_library.spectra_data.sort_values(
# Select the 1000 highest scoring or all if there are less than 1000
# Select the 500 highest scoring for crosslinked peptides
if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]):
self.alignment_library.spectra_data = self.alignment_library.spectra_data.sort_values(
by="SCORE", ascending=False
).iloc[:120]

).iloc[:1000]
else:
self.alignment_library.spectra_data = self.alignment_library.spectra_data.sort_values(
by="SCORE", ascending=False
).iloc[:1000]
# Repeat dataframe for each CE
ce_range = range(18, 50)
nrow = len(self.alignment_library.spectra_data)
Expand All @@ -168,26 +186,47 @@ def _alignment(self):
Check https://gitlab.lrz.de/proteomics/prosit_tools/oktoberfest/-/blob/develop/oktoberfest/ce_calibration/grpc_alignment.py
"""
pred_intensity = self.alignment_library.get_matrix(FragmentType.PRED)
raw_intensity = self.alignment_library.get_matrix(FragmentType.RAW)
# return pred_intensity.toarray(), raw_intensity.toarray()
sm = SimilarityMetrics(pred_intensity, raw_intensity)
self.alignment_library.spectra_data["SPECTRAL_ANGLE"] = sm.spectral_angle(raw_intensity, pred_intensity, 0)
self.alignment_library.spectra_data.to_csv(self.results_path / "SA.tsv", sep="\t")
self.ce_alignment = self.alignment_library.spectra_data.groupby(by=["COLLISION_ENERGY"])[
"SPECTRAL_ANGLE"
].mean()
if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]):
pred_intensity_a = self.alignment_library.get_matrix(FragmentType.PRED_A)

pred_intensity_b = self.alignment_library.get_matrix(FragmentType.PRED_B)
raw_intensity_a = self.alignment_library.get_matrix(FragmentType.RAW_A)
raw_intensity_b = self.alignment_library.get_matrix(FragmentType.RAW_B)
#print(pred_intensity_a.shape)
#print(pred_intensity_b.shape)
#print(raw_intensity_a.shape)
#print(raw_intensity_b.shape)
sm_a = SimilarityMetrics(pred_intensity_a, raw_intensity_a)
sm_b = SimilarityMetrics(pred_intensity_b, raw_intensity_b)
self.alignment_library.spectra_data["SPECTRAL_ANGLE_A"] = sm_a.spectral_angle(raw_intensity_a, pred_intensity_a, 0)
self.alignment_library.spectra_data["SPECTRAL_ANGLE_B"] = sm_a.spectral_angle(raw_intensity_b, pred_intensity_b, 0)
self.alignment_library.spectra_data["SPECTRAL_ANGLE"] = (self.alignment_library.spectra_data["SPECTRAL_ANGLE_A"]+
self.alignment_library.spectra_data["SPECTRAL_ANGLE_B"]) / 2
self.alignment_library.spectra_data.to_csv(self.results_path / "SA.tsv", sep="\t")
self.ce_alignment = self.alignment_library.spectra_data.groupby(by=["COLLISION_ENERGY"])[
"SPECTRAL_ANGLE"].mean()
else:
pred_intensity = self.alignment_library.get_matrix(FragmentType.PRED)
raw_intensity = self.alignment_library.get_matrix(FragmentType.RAW)

# return pred_intensity.toarray(), raw_intensity.toarray()
sm = SimilarityMetrics(pred_intensity, raw_intensity)
self.alignment_library.spectra_data["SPECTRAL_ANGLE"] = sm.spectral_angle(raw_intensity, pred_intensity, 0)
self.alignment_library.spectra_data.to_csv(self.results_path / "SA.tsv", sep="\t")
self.ce_alignment = self.alignment_library.spectra_data.groupby(by=["COLLISION_ENERGY"])[
"SPECTRAL_ANGLE"
].mean()
plot_mean_sa_ce(
sa_ce_df=self.ce_alignment,
filename=self.results_path / f"{self.raw_path.stem}_mean_spectral_angle_ce.svg",
best_ce=self.ce_alignment.idxmax(),
)
sa_ce_df=self.ce_alignment,
filename=self.results_path / f"{self.raw_path.stem}_mean_spectral_angle_ce.svg",
best_ce=self.ce_alignment.idxmax(),
)
plot_violin_sa_ce(
df=self.alignment_library.spectra_data[["COLLISION_ENERGY", "SPECTRAL_ANGLE"]],
filename=self.results_path / f"{self.raw_path.stem}_violin_spectral_angle_ce.svg",
best_ce=self.ce_alignment.idxmax(),
)

def perform_alignment(self, df_search: pd.DataFrame):
"""
Perform alignment and get the best CE.
Expand Down
2 changes: 1 addition & 1 deletion oktoberfest/constants_dir.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
CONFIG_PATH = "/home/mkalhor/wilhelmlab/notebooks/notebooks/config.json"
CONFIG_PATH = "/cmnfs/home/m.kalhor/wilhelmlab/notebooks/notebooks/config.json"
65 changes: 44 additions & 21 deletions oktoberfest/data/spectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,38 @@
import spectrum_fundamentals.constants as c
from scipy.sparse import coo_matrix, spmatrix
from spectrum_io.file import hdf5
#from utils.config import Config


from ..utils.config import Config
from ..constants_dir import CONFIG_PATH

logger = logging.getLogger(__name__)


class FragmentType(Enum):
"""FragmentType class to enumerate pred, raw, and mz."""

PRED = 1
RAW = 2
MZ = 3
PRED = 1
PRED_A = 2
PRED_B = 3
RAW = 4
RAW_A = 5
RAW_B = 6
MZ = 7
MZ_A = 8
MZ_B = 9


class Spectra:
"""Main to init spectra data."""

INTENSITY_COLUMN_PREFIX = "INTENSITY_RAW"
INTENSITY_COLUMN_PREFIX_A = "INTENSITY_RAW_A"
INTENSITY_COLUMN_PREFIX_B = "INTENSITY_RAW_B"
INTENSITY_PRED_PREFIX = "INTENSITY_PRED"
INTENSITY_PRED_PREFIX_A = "INTENSITY_PRED_A"
INTENSITY_PRED_PREFIX_B = "INTENSITY_PRED_B"
MZ_COLUMN_PREFIX = "MZ_RAW"
MZ_COLUMN_PREFIX_A = "MZ_RAW_A"
MZ_COLUMN_PREFIX_B = "MZ_RAW_B"
EPSILON = 1e-7
COLUMNS_FRAGMENT_ION = ["Y1+", "Y1++", "Y1+++", "B1+", "B1++", "B1+++"]

Expand All @@ -49,12 +60,9 @@ def _gen_column_names(fragment_type: FragmentType) -> List[str]:
"""
prefix = Spectra._resolve_prefix(fragment_type)
columns = []
#config = Config()
#print(config.search_type)
#if config.search_type == "Plink2":

search_type = "XlinkX"
if search_type == "XlinkX":
config = Config()
config.read(CONFIG_PATH)
if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]):
max_range = 59
else:
max_range = 30
Expand All @@ -75,9 +83,21 @@ def _resolve_prefix(fragment_type: FragmentType) -> str:
if fragment_type.value == 1:
prefix = Spectra.INTENSITY_PRED_PREFIX
elif fragment_type.value == 2:
prefix = Spectra.INTENSITY_COLUMN_PREFIX
else:
prefix = Spectra.INTENSITY_PRED_PREFIX_A
elif fragment_type.value == 3:
prefix = Spectra.INTENSITY_PRED_PREFIX_B
elif fragment_type.value == 4:
prefix = Spectra.INTENSITY_COLUMN_PREFIX
elif fragment_type.value == 5:
prefix = Spectra.INTENSITY_COLUMN_PREFIX_A
elif fragment_type.value == 6:
prefix = Spectra.INTENSITY_COLUMN_PREFIX_B
elif fragment_type.value == 7:
prefix = Spectra.MZ_COLUMN_PREFIX
elif fragment_type.value == 8:
prefix = Spectra.MZ_COLUMN_PREFIX_A
else:
prefix = Spectra.MZ_COLUMN_PREFIX_B
return prefix

def add_column(self, column_data: pd.Series, name: str) -> None:
Expand Down Expand Up @@ -134,13 +154,12 @@ def add_matrix(self, intensity_data: pd.DataFrame, fragment_type: FragmentType)
intensity_df = intensity_data.explode()

# reshape based on the number of fragments
#config = Config()
search_type = "XlinkX"
#if config.search_type == "Plink2":
if search_type == "XlinkX":
config = Config()
config.read(CONFIG_PATH)
if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]):
intensity_array = intensity_df.values.astype(np.float32).reshape(-1, c.VEC_LENGTH_CMS2)
else:
intensity_array = intensity_df.values.astype(np.float32).reshape(-1, c.VEC_LENGTH)
intensity_array = intensity_df.values.astype(np.float32).reshape(-1, c.VEC_LENGTH)

# Change zeros to epislon to keep the info of invalid values
# change the -1 values to 0 (for better performance when converted to sparse representation)
Expand All @@ -150,9 +169,11 @@ def add_matrix(self, intensity_data: pd.DataFrame, fragment_type: FragmentType)
# generate column names and build dataframe from sparse matrix
intensity_df = pd.DataFrame.sparse.from_spmatrix(coo_matrix(intensity_array, dtype=np.float32))
columns = self._gen_column_names(fragment_type)
#print(columns)

intensity_df.columns = columns
self.add_columns(intensity_df)
#print(intensity_df)
#print(intensity_df.shape)

def get_columns(self, fragment_type: FragmentType, return_column_names: bool = False) -> spmatrix:
"""
Expand Down Expand Up @@ -184,6 +205,8 @@ def get_matrix(self, fragment_type: FragmentType, return_column_names: bool = Fa
if return_column_names:
return scipy.sparse.csr_matrix(self.spectra_data[columns_to_select].values), columns_to_select
# Check if conversion is low change to coo then csr from coo
#print(scipy.sparse.csr_matrix(self.spectra_data[columns_to_select].values))
#print(scipy.sparse.csr_matrix(self.spectra_data[columns_to_select].shape))
return scipy.sparse.csr_matrix(self.spectra_data[columns_to_select].values)

def write_as_hdf5(self, output_file: Union[str, Path]) -> None:
Expand Down Expand Up @@ -226,4 +249,4 @@ def read_from_hdf5(self, input_file: Union[str, Path]) -> None:
input_file = str(input_file)
self.add_columns(hdf5.read_file(input_file, hdf5.META_DATA_KEY))
self.add_matrix_from_hdf5(hdf5.read_file(input_file, f"sparse_{hdf5.INTENSITY_RAW_KEY}"), FragmentType.RAW)
self.add_matrix_from_hdf5(hdf5.read_file(input_file, f"sparse_{hdf5.MZ_RAW_KEY}"), FragmentType.MZ)
self.add_matrix_from_hdf5(hdf5.read_file(input_file, f"sparse_{hdf5.MZ_RAW_KEY}"), FragmentType.MZ)
58 changes: 54 additions & 4 deletions oktoberfest/spectral_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def grpc_predict(self, library: Spectra, alignment: bool = False):

intensity_model = self.config.models["intensity"]
if "xl" in intensity_model.lower():
intensity_input_data = {
intensity_input_data_a = {
"peptide_sequences_1": (
library.spectra_data["MODIFIED_SEQUENCE_A"].to_numpy().reshape(-1, 1).astype(np.object_),
"BYTES",
Expand All @@ -188,7 +188,57 @@ def grpc_predict(self, library: Spectra, alignment: bool = False):
library.spectra_data["PRECURSOR_CHARGE"].to_numpy().reshape(-1, 1).astype(np.int32),
"INT32",
),
}
}
intensity_input_data_b = {
"peptide_sequences_1": (
library.spectra_data["MODIFIED_SEQUENCE_B"].to_numpy().reshape(-1, 1).astype(np.object_),
"BYTES",
),
"peptide_sequences_2": (
library.spectra_data["MODIFIED_SEQUENCE_A"].to_numpy().reshape(-1, 1).astype(np.object_),
"BYTES",
),
"collision_energies": (
library.spectra_data["COLLISION_ENERGY"].to_numpy().reshape(-1, 1).astype(np.float32),
"FP32",
),
"precursor_charges": (
library.spectra_data["PRECURSOR_CHARGE"].to_numpy().reshape(-1, 1).astype(np.int32),
"INT32",
),
}
intensity_outputs_a = ["intensities", "mz", "annotation"]
intensity_outputs_b = ["intensities", "mz", "annotation"]

intensity_predictions_a = infer_predictions(
triton_client,
model=intensity_model,
input_data=intensity_input_data_a,
outputs=intensity_outputs_a,
batch_size=batch_size,
)
intensity_predictions_b = infer_predictions(
triton_client,
model=intensity_model,
input_data=intensity_input_data_b,
outputs=intensity_outputs_b,
batch_size=batch_size,
)
intensity_predictions_a["intensities"][np.where(intensity_predictions_a["intensities"] < 1e-7)] = 0.0
intensity_predictions_b["intensities"][np.where(intensity_predictions_b["intensities"] < 1e-7)] = 0.0

intensities_pred_a = pd.DataFrame()
intensities_pred_b = pd.DataFrame()
intensities_pred_a["intensity"] = intensity_predictions_a["intensities"].tolist()
intensities_pred_b["intensity"] = intensity_predictions_b["intensities"].tolist()
library.add_matrix(intensities_pred_a["intensity"], FragmentType.PRED_A)
#print("FragmentType.PRED_A")
library.add_matrix(intensities_pred_b["intensity"], FragmentType.PRED_B)
#print("FragmentType.PRED_B")
#dir(library)
if alignment:
return

else:
intensity_input_data = {
"peptide_sequences": (
Expand Down Expand Up @@ -221,7 +271,7 @@ def grpc_predict(self, library: Spectra, alignment: bool = False):
batch_size=batch_size,
)
intensity_predictions["intensities"][np.where(intensity_predictions["intensities"] < 1e-7)] = 0.0
"""

irt_model = self.config.models["irt"]
irt_input_data = {"peptide_sequences": intensity_input_data["peptide_sequences"]}
irt_outputs = ["irt"]
Expand All @@ -232,7 +282,7 @@ def grpc_predict(self, library: Spectra, alignment: bool = False):
outputs=irt_outputs,
batch_size=batch_size,
)
"""

if self.config.job_type == "SpectralLibraryGeneration":
intensity_prediction_dict = {
"intensity": intensity_predictions["intensities"],
Expand Down

0 comments on commit 58792e3

Please sign in to comment.