Skip to content

Commit

Permalink
Merge pull request #276 from wilhelm-lab/development
Browse files Browse the repository at this point in the history
Add cit pipeline to the release
  • Loading branch information
WassimG authored Oct 7, 2024
2 parents 98f689d + 0021bc3 commit 0005103
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 10 deletions.
16 changes: 14 additions & 2 deletions oktoberfest/preprocessing/preprocessing.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ def convert_search(
tmt_label: str = "",
custom_mods: Optional[dict[str, int]] = None,
output_file: Optional[Union[str, Path]] = None,
ptm_unimod_id: Optional[int] = 0,
ptm_sites: Optional[list] = None,
) -> pd.DataFrame:
r"""
Convert search results to Oktoberfest format.
Expand All @@ -414,7 +416,8 @@ def convert_search(
static and variable mods as keys. The values are the integer values of the respective unimod identifier
:param output_file: Optional path to the location where the converted search results should be written to.
If this is omitted, the results are not stored.
:param ptm_unimod_id: unimod id used for site localization
:param ptm_sites: possible sites that the ptm can exist on
:raises ValueError: if an unsupported search engine was given
:return: A dataframe containing the converted results.
Expand Down Expand Up @@ -503,7 +506,11 @@ def convert_search(
raise ValueError(f"Unknown search engine provided: {search_engine}")

return search_result(input_path).generate_internal(
tmt_label=tmt_label, out_path=output_file, custom_mods=custom_mods
tmt_label=tmt_label,
out_path=output_file,
custom_mods=custom_mods,
ptm_unimod_id=ptm_unimod_id,
ptm_sites=ptm_sites,
)


Expand Down Expand Up @@ -850,6 +857,7 @@ def annotate_spectral_library(
mass_tol: Optional[float] = None,
unit_mass_tol: Optional[str] = None,
custom_mods: Optional[dict[str, float]] = None,
annotate_neutral_loss: Optional[bool] = False,
) -> Spectra:
"""
Annotate all specified ion peaks of given PSMs (Default b and y ions).
Expand All @@ -865,6 +873,7 @@ def annotate_spectral_library(
:param unit_mass_tol: The unit in which the mass tolerance is given
:param fragmentation_method: fragmentation method that was used
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:param annotate_neutral_loss: flag to indicate whether to annotate neutral loss peaks or not
:return: Spectra object containing the annotated b and y ion peaks including metadata
Expand Down Expand Up @@ -892,6 +901,7 @@ def annotate_spectral_library(
unit_mass_tolerance=unit_mass_tol,
fragmentation_method=fragmentation_method,
custom_mods=custom_mods,
annotate_neutral_loss=annotate_neutral_loss,
)

ion_types = retrieve_ion_types(fragmentation_method)
Expand All @@ -903,6 +913,8 @@ def annotate_spectral_library(
)
aspec.add_mzs(np.stack(df_annotated_spectra["MZ"]), FragmentType.MZ)
aspec.add_column(df_annotated_spectra["CALCULATED_MASS"].values, "CALCULATED_MASS")
aspec.add_column(df_annotated_spectra["EXPECTED_NL_COUNT"].values, "EXPECTED_NL_COUNT")
aspec.add_column(df_annotated_spectra["ANNOTATED_NL_COUNT"].values, "ANNOTATED_NL_COUNT")
aspec.strings_to_categoricals()

logger.info("Finished annotating.")
Expand Down
6 changes: 6 additions & 0 deletions oktoberfest/rescore/rescore.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def generate_features(
additional_columns: str | list | None = None,
all_features: bool = False,
regression_method: str = "spline",
add_neutral_loss_features: bool = False,
remove_miss_cleavage_features: bool = False,
):
"""
Generate features to be used for percolator or mokapot target decoy separation.
Expand All @@ -38,6 +40,8 @@ def generate_features(
:param additional_columns: additional columns supplied in the search results to be used as features (either a list or "all")
:param all_features: whether to use all features or only the standard set TODO
:param regression_method: The regression method to use for iRT alignment
:param add_neutral_loss_features: Flag to indicate whether to add neutral loss features to percolator or not
:param remove_miss_cleavage_features: Flag to indicate whether to remove miss cleavage features from percolator or not
:Example:
Expand Down Expand Up @@ -87,6 +91,8 @@ def generate_features(
additional_columns=additional_columns,
all_features_flag=all_features,
regression_method=regression_method,
neutral_loss_flag=add_neutral_loss_features,
drop_miss_cleavage_flag=remove_miss_cleavage_features,
)
perc_features.calc()
perc_features.write_to_file(str(output_file))
Expand Down
62 changes: 56 additions & 6 deletions oktoberfest/runner.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,16 @@ def _preprocess(spectra_files: list[Path], config: Config) -> list[Path]:
msms_output.mkdir(exist_ok=True)
internal_search_file = msms_output / "msms.prosit"
tmt_label = config.tag

ptm_unimods = config.ptm_unimod_id
ptm_sites = config.ptm_possible_sites
search_results = pp.convert_search(
input_path=config.search_results,
search_engine=config.search_results_type,
tmt_label=tmt_label,
custom_mods=config.custom_to_unimod(),
output_file=internal_search_file,
ptm_unimod_id=ptm_unimods,
ptm_sites=ptm_sites,
)
if config.spectra_type.lower() in ["d", "hdf"]:
timstof_metadata = pp.convert_timstof_metadata(
Expand Down Expand Up @@ -149,12 +152,14 @@ def _annotate_and_get_library(spectra_file: Path, config: Config, tims_meta_file
spectra["INSTRUMENT_TYPES"] = config_instrument_type
search = pp.load_search(config.output / "msms" / spectra_file.with_suffix(".rescore").name)
library = pp.merge_spectra_and_peptides(spectra, search)
annotate_neutral_loss = config.ptm_use_neutral_loss
aspec = pp.annotate_spectral_library(
psms=library,
mass_tol=config.mass_tolerance,
unit_mass_tol=config.unit_mass_tolerance,
fragmentation_method=config.fragmentation_method,
custom_mods=config.unimod_to_mass(),
annotate_neutral_loss=annotate_neutral_loss,
)
aspec.write_as_hdf5(hdf5_path) # write_metadata_annotation

Expand Down Expand Up @@ -562,7 +567,6 @@ def _refinement_learn(spectra_files: list[Path], config: Config):

def _calculate_features(spectra_file: Path, config: Config):
library = _ce_calib(spectra_file, config)

calc_feature_step = ProcessStep(config.output, "calculate_features." + spectra_file.stem)
if calc_feature_step.is_done():
return
Expand Down Expand Up @@ -597,7 +601,8 @@ def _calculate_features(spectra_file: Path, config: Config):
# produce percolator tab files
fdr_dir = config.output / "results" / config.fdr_estimation_method
fdr_dir.mkdir(exist_ok=True)

add_neutral_loss_features = config.ptm_use_neutral_loss
remove_miss_cleavage_features = ("R" in config.ptm_possible_sites) or ("K" in config.ptm_possible_sites)
re.generate_features(
library=library,
search_type="original",
Expand All @@ -613,6 +618,8 @@ def _calculate_features(spectra_file: Path, config: Config):
additional_columns=config.use_feature_cols,
all_features=config.all_features,
regression_method=config.curve_fitting_method,
add_neutral_loss_features=add_neutral_loss_features,
remove_miss_cleavage_features=remove_miss_cleavage_features,
)

calc_feature_step.mark_done()
Expand All @@ -634,8 +641,13 @@ def _rescore(fdr_dir: Path, config: Config):
re.rescore_with_percolator(input_file=fdr_dir / "original.tab", output_folder=fdr_dir)
rescore_original_step.mark_done()
if not rescore_prosit_step.is_done():
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir)
rescore_prosit_step.mark_done()
logger.info("Start percolator rescoring")
logger.info(config.ptm_localization)
if config.ptm_localization:
_ptm_localization_rescore(fdr_dir, config)
else:
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir)
rescore_prosit_step.mark_done()
elif config.fdr_estimation_method == "mokapot":
if not rescore_original_step.is_done():
re.rescore_with_mokapot(input_file=fdr_dir / "original.tab", output_folder=fdr_dir)
Expand All @@ -649,6 +661,43 @@ def _rescore(fdr_dir: Path, config: Config):
)


def _ptm_localization_rescore(fdr_dir: Path, config: Config):
"""
Helper function for running percolator to do PTM localization.
:param fdr_dir: the output directory
:param config: the configuration object
"""
df_rescore = pd.read_csv(fdr_dir / "rescore.tab", sep="\t")
unimod_id = config.ptm_unimod_id
df_rescore["id"] = df_rescore["filename"] + df_rescore["ScanNr"].astype(str)
df_rescore_fil = df_rescore[df_rescore["Peptide"].str.contains("UNIMOD:" + str(unimod_id))]
df_rescore = df_rescore[df_rescore["id"].isin(df_rescore_fil["id"].tolist())]
df_rescore.drop(columns=["id"], inplace=True)
df_rescore.to_csv(fdr_dir / "rescore.tab", sep="\t", index=False)
new_rescore_dir = fdr_dir / "localize_mod"
new_rescore_dir.mkdir(parents=True, exist_ok=True)

if unimod_id == 7:
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir)
df_rescore_psms_targets = pd.read_csv(fdr_dir / "rescore.percolator.psms.txt", sep="\t")
df_rescore_psms_decoys = pd.read_csv(fdr_dir / "rescore.percolator.decoy.psms.txt", sep="\t")
df_rescore_psms_targets = df_rescore_psms_targets[
df_rescore_psms_targets["peptide"].apply(lambda x: "UNIMOD:" + str(unimod_id) in x)
]
df_rescore_psms_decoys = df_rescore_psms_decoys[
df_rescore_psms_decoys["peptide"].apply(lambda x: "UNIMOD:" + str(unimod_id) in x)
]
df_rescore_psms = pd.concat([df_rescore_psms_targets, df_rescore_psms_decoys])
df_rescore_psms = df_rescore_psms[["PSMId"]]
df_rescore_psms.rename(columns={"PSMId": "SpecId"}, inplace=True)
df_rescore = df_rescore.merge(df_rescore_psms, on="SpecId", how="inner")
df_rescore.to_csv(new_rescore_dir / "rescore.tab", sep="\t", index=False)
re.rescore_with_percolator(input_file=new_rescore_dir / "rescore.tab", output_folder=new_rescore_dir)
else:
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=new_rescore_dir)


def run_rescoring(config_path: Union[str, Path]):
"""
Create a ReScore object and run the rescoring.
Expand Down Expand Up @@ -717,7 +766,8 @@ def run_rescoring(config_path: Union[str, Path]):

# plotting
logger.info("Generating summary plots...")
pl.plot_all(fdr_dir)
if not config.ptm_localization:
pl.plot_all(fdr_dir)
logger.info("Finished rescoring.")

if config.quantification:
Expand Down
30 changes: 30 additions & 0 deletions oktoberfest/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,36 @@ def nr_ox(self) -> int:
"""Get the maximum number of oxidations allowed on M residues in peptides during spectral library generation."""
return self.spec_lib_options.get("nrOx", 1)

###########################
# these are PTM localization options #
###########################

@property
def ptm_localization(self) -> bool:
"""Get ptm localization flag from the config file."""
return self.data.get("ptm_localization", False)

@property
def ptm_localization_options(self) -> dict:
"""Get ptm localization dictionary from the config file."""
return self.data.get("ptmLocalizationOptions", {})

@property
def ptm_unimod_id(self) -> int:
"""Get the unimod id required for localization."""
unimod_id = self.ptm_localization_options.get("unimod_id", 0)
return unimod_id

@property
def ptm_possible_sites(self) -> list:
"""Get which sites ptm can exist on."""
return self.ptm_localization_options.get("possible_sites", [])

@property
def ptm_use_neutral_loss(self) -> bool:
"""Get neutral loss flag to indicate whether to add a score for this or not."""
return self.ptm_localization_options.get("neutral_loss", False)

#####################################################################
# these are local prediction / transfer&refinement learning options #
#####################################################################
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ matplotlib = "^3.6.3"
mokapot = ">=0.9.1,<0.11.0"
numpy = ">=1.20,<1.25"
seaborn = ">=0.12.2,<0.14.0"
spectrum_fundamentals = ">=0.7.3,<0.8.0"
spectrum-io = ">=0.6.2,<0.7.0"
spectrum_fundamentals = ">=0.7.4,<0.8.0"
spectrum-io = ">=0.6.3,<0.7.0"
picked_group_fdr = ">=0.7.1"

dlomix = {extras = ["rltl-report", "wandb"], git = "[email protected]:wilhelm-lab/dlomix.git", branch = "feature/bmpc", optional = true}
Expand Down

0 comments on commit 0005103

Please sign in to comment.