Skip to content

Commit

Permalink
Merge pull request #20 from RECETOX/issue17
Browse files Browse the repository at this point in the history
First iteration refactoring of the python scripts
  • Loading branch information
hechth authored Nov 14, 2023
2 parents adf8208 + 1992e49 commit 48606b6
Show file tree
Hide file tree
Showing 5 changed files with 867 additions and 538 deletions.
27 changes: 11 additions & 16 deletions analysis/Python_scripts/add_zeros.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,40 @@
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from matchms.importing import load_from_msp\n",
"from matchms.logging_functions import set_matchms_logger_level\n",
"import itertools\n",
"from utils import load_spectra_metadata\n",
"\n",
"set_matchms_logger_level(\"ERROR\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"predicted_spectra = list(load_from_msp(\"../data/filtered/simulated_matchms_filter_1%I_all_peaks.msp\"))\n",
"predicted_spectra_metadata= pd.DataFrame.from_dict([x.metadata for x in predicted_spectra])\n",
"predicted_spectra_metadata.rename(columns={'compound_name': 'query'}, inplace=True)\n",
"predicted_spectra_names = predicted_spectra_metadata['query'].to_list()"
"predicted_spectra, predicted_spectra_metadata, predicted_spectra_names = load_spectra_metadata(\"../data/filtered/simulated_matchms_filter_1%I_all_peaks.msp\", 'query')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"experimental_spectra = list(load_from_msp(\"../data/experimental/RECETOX_GC-EI_MS_20201028.msp\"))\n",
"experimental_spectra_metadata= pd.DataFrame.from_dict([x.metadata for x in experimental_spectra])\n",
"experimental_spectra_metadata.rename(columns={'compound_name': 'reference'}, inplace=True)\n",
"experimental_spectra_names = experimental_spectra_metadata['reference'].to_list()"
"experimental_spectra, experimental_spectra_metadata, experimental_spectra_names = load_spectra_metadata(\"../data/experimental/RECETOX_GC-EI_MS_20201028.msp\", 'reference')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -49,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -59,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -69,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -79,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
Expand Down
879 changes: 545 additions & 334 deletions analysis/Python_scripts/scatterplot.ipynb

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions analysis/Python_scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
from rdkit import Chem
from itertools import combinations
from matchms.importing import load_from_msp

def is_spectrum_for_compound(compund_name, spectrum_name):
options = [compund_name + x for x in ["", "_isomer1", "_isomer2", " isomer 1", " isomer 2"]]
Expand All @@ -11,19 +12,47 @@ def get_matching_rows(df, query_name, reference_name):
return df[df.apply(lambda x: is_spectrum_for_compound(x[query_name], x[reference_name]), axis=1)]

def has_halogen_atoms(mol):
    """
    Check if a molecule contains any halogen atoms.

    Parameters:
    - mol (Chem.Mol): RDKit molecule object.

    Returns:
    - bool: True if at least one atom symbol is F, Cl, Br or I, False otherwise
      (including for a molecule without atoms).
    """
    # Built once per call instead of once per atom; set membership is O(1).
    halogens = {'F', 'Cl', 'Br', 'I'}
    return any(atom.GetSymbol() in halogens for atom in mol.GetAtoms())

def has_atom(mol, atom):
    """
    Check if a molecule contains a specific type of atom.

    Parameters:
    - mol (Chem.Mol): RDKit molecule object.
    - atom (str): Symbol of the atom to check; compared for exact equality
      with each atom's symbol, so the match is case-sensitive.

    Returns:
    - bool: True if the molecule contains the specified atom, False otherwise.
    """
    return any(mol_atom.GetSymbol() == atom for mol_atom in mol.GetAtoms())

def has_organic_atoms(mol):
"""
Check if a molecule contains any organic atoms (C, O, N, H).
Parameters:
- mol (Chem.Mol): RDKit molecule object.
Returns:
- bool: True if the molecule has organic atoms, False otherwise.
"""
# Check if the molecule contains any organic atoms (C, O, N, H)
for atom in mol.GetAtoms():
if atom.GetSymbol() in ['C', 'O', 'N', 'H']:
Expand All @@ -32,6 +61,16 @@ def has_organic_atoms(mol):
return False

def append_classes(df, left_on):
"""
Append molecular classes information to a DataFrame based on a specified column.
Parameters:
- df (pd.DataFrame): The input DataFrame.
- left_on (str): The column to merge on.
Returns:
- pd.DataFrame: The input DataFrame with additional molecular classes information.
"""
molecules = Chem.SDMolSupplier("../../data/RECETOX_GC-EI-MS_20201028.sdf")
class_names = pd.DataFrame({
"class" : [m.GetProp("Class") for m in molecules],
Expand All @@ -55,16 +94,47 @@ def append_classes(df, left_on):

# Define a function to map the true columns to a list of names
def get_true_names(row, df, start=11, stop=18):
    """
    Map true columns to a list of names for a given row.

    Only the columns in the positional window ``df.columns[start:stop]`` are
    inspected; a column name is kept when the row's value for it is truthy.

    Parameters:
    - row: The row in the DataFrame (anything indexable by column name).
    - df (pd.DataFrame): The DataFrame whose column order defines the window.
    - start (int): Position of the first column to inspect. Defaults to 11.
    - stop (int): Position one past the last column to inspect. Defaults to 18.

    Returns:
    - list: List of true column names for the given row.
    """
    # NOTE(review): the default 11:18 window presumably covers the boolean
    # indicator columns of the merged table - confirm against the caller
    # whenever the column layout changes.
    return [col for col in df.columns[start:stop] if row[col]]

# Function to split values with commas and create new rows
def split_and_add_rows(df, column_name, split_by):
    """
    Expand delimiter-separated values of one column into separate rows.

    Parameters:
    - df (pd.DataFrame): The input DataFrame; it is not modified.
    - column_name (str): The column whose string values are split.
    - split_by (str): The delimiter to split values on.

    Returns:
    - pd.DataFrame: A copy of the input with one row per split value and a
      fresh 0..n-1 index; the other columns are repeated for each new row.
    """
    # Split on the original frame, then write into a copy so the caller's
    # DataFrame stays untouched.
    pieces = df[column_name].str.split(split_by)
    expanded = df.copy()
    expanded[column_name] = pieces
    return expanded.explode(column_name).reset_index(drop=True)

def generate_combinations(df, column_name):
"""
Generate combinations of values in a DataFrame column and create new rows.
Parameters:
- df (pd.DataFrame): The input DataFrame.
- column_name (str): The column to generate combinations for.
Returns:
- pd.DataFrame: DataFrame with additional rows after generating combinations for the specified column.
"""
new_rows = []
for index, row in df.iterrows():
values = row[column_name].split(', ')
Expand Down Expand Up @@ -99,3 +169,10 @@ def preprocess_data(merged_top5_same):
mdf = mdf.dropna(subset=['value', 'true_names'])

return mdf

def load_spectra_metadata(file_path, metadata_column_name):
    """
    Load spectra from an MSP file together with their metadata table and names.

    Parameters:
    - file_path (str): Path to the MSP file to read.
    - metadata_column_name (str): Name given to the 'compound_name' metadata
      column in the returned DataFrame (e.g. 'query' or 'reference').

    Returns:
    - tuple: (spectra, spectra_metadata, spectra_names) where spectra is the
      list of loaded spectra, spectra_metadata is a pd.DataFrame with one row
      per spectrum built from each spectrum's metadata, and spectra_names is
      the list of values from the renamed column.

    Raises:
    - KeyError: If the renamed column is absent, i.e. no spectrum carries a
      'compound_name' entry (this includes a file with no spectra).
    """
    spectra = list(load_from_msp(file_path))
    # A list of per-spectrum dicts is row-oriented input for the DataFrame
    # constructor; DataFrame.from_dict is meant for a dict of columns.
    spectra_metadata = pd.DataFrame([spectrum.metadata for spectrum in spectra])
    spectra_metadata = spectra_metadata.rename(columns={'compound_name': metadata_column_name})
    spectra_names = spectra_metadata[metadata_column_name].to_list()
    return spectra, spectra_metadata, spectra_names
Loading

0 comments on commit 48606b6

Please sign in to comment.