Merge pull request #22 from MolecularAI/route-comp-updates
Route comparison and scoring updates
SGenheden authored Oct 11, 2024
2 parents 6445f50 + 67059ce commit c809033
Showing 53 changed files with 3,503 additions and 425 deletions.
680 changes: 361 additions & 319 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "reaction_utils"
version = "1.6.0"
version = "1.7.0"
description = "Utilities for working with reactions, reaction templates and template extraction"
authors = ["Genheden, Samuel <[email protected]>", "Kannas, Christos <[email protected]>"]
license = "Apache-2.0"
@@ -28,6 +28,8 @@ numpy = "^1.0.0"
rdkit = "^2023.9.1"
cgrtools = "^4.1.35"
scipy = "^1.11.4"
pydantic = "^2.8.2"
apted = "^1.0.3"

[tool.poetry.dev-dependencies]
pytest = "^6.2.2"
22 changes: 22 additions & 0 deletions rxnutils/chem/augmentation.py
@@ -0,0 +1,22 @@
""" Routines for augmenting chemical reactions
"""

_SINGLE_REACTANT_REAGENTS = {"10.1.1": "Br", "10.1.2": "Cl"}


def single_reactant_augmentation(smiles: str, classification: str) -> str:
"""
Augment single-reactant reaction with additional reagent if possible
based on the classification of the reaction
:param smiles: the reaction SMILES to augment
:param classification: the classification of the reaction or an empty string
:return: the processed SMILES
"""
reactants = smiles.split(">")[0]
if "." in reactants:
return smiles
classification = classification.split(" ")[0]
new_reactant = _SINGLE_REACTANT_REAGENTS.get(classification)
if new_reactant:
return new_reactant + "." + smiles
return smiles
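
A minimal usage sketch of the new helper, following directly from the function above; the classification label text ("Bromination") is illustrative and the import path matches the file added in this commit:

from rxnutils.chem.augmentation import single_reactant_augmentation

# Single-reactant reaction classified as 10.1.1: "Br" is prepended as a reagent
print(single_reactant_augmentation("c1ccccc1>>Brc1ccccc1", "10.1.1 Bromination"))
# -> Br.c1ccccc1>>Brc1ccccc1

# Reactions that already have several reactants are returned unchanged
print(single_reactant_augmentation("CCBr.CCO>>CCOCC", ""))
# -> CCBr.CCO>>CCOCC
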
4 changes: 2 additions & 2 deletions rxnutils/chem/cgr.py
@@ -4,9 +4,9 @@
import warnings
from typing import List

from CGRtools.files.SDFrw import SDFRead
from CGRtools.containers.reaction import ReactionContainer
from CGRtools.containers.molecule import MoleculeContainer
from CGRtools.containers.reaction import ReactionContainer
from CGRtools.files.SDFrw import SDFRead
from rdkit import Chem

from rxnutils.chem.reaction import ChemicalReaction
24 changes: 24 additions & 0 deletions rxnutils/chem/disconnection_sites/atom_map_tagging.py
@@ -95,6 +95,30 @@ def get_atom_list(reactants_smiles: str, product_smiles: str) -> List[int]:
return atom_list


def atom_map_tag_reactants(mapped_rxn: str) -> str:
"""
Given atom-mapped reaction, returns disconnection site-tagged reactants where atoms
with changed atom environment are represented by [<atom>:1].
:param mapped_rxn: Atom-mapped reaction SMILES
:return: SMILES of the reactants containing tags corresponding to atoms changed in the
reaction.
"""
reactants_smiles, _, product_smiles = mapped_rxn.split(">")

reactants_mol = Chem.MolFromSmiles(reactants_smiles)
atom_list = get_atom_list(reactants_smiles, product_smiles)

# Tag reactant atoms whose environment changes in the reaction with map number 1
for atom in reactants_mol.GetAtoms():
if atom.GetAtomMapNum() in atom_list:
atom.SetAtomMapNum(1)
else:
atom.SetAtomMapNum(0)

return Chem.MolToSmiles(reactants_mol)


def atom_map_tag_products(mapped_rxn: str) -> str:
"""
Given atom-mapped reaction, returns disconnection site-tagged product where atoms
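
A brief, hypothetical sketch of how the new reactant-tagging helper can be used alongside the existing atom_map_tag_products; the atom-mapped reaction SMILES below is made up for illustration:

from rxnutils.chem.disconnection_sites.atom_map_tagging import (
    atom_map_tag_products,
    atom_map_tag_reactants,
)

# Hypothetical atom-mapped esterification; atoms whose environment changes are
# tagged with map number 1, all other map numbers are cleared to 0
mapped_rxn = "[CH3:1][C:2](=[O:3])[OH:4].[CH3:5][OH:6]>>[CH3:1][C:2](=[O:3])[O:6][CH3:5]"
print(atom_map_tag_reactants(mapped_rxn))
print(atom_map_tag_products(mapped_rxn))
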
12 changes: 8 additions & 4 deletions rxnutils/chem/disconnection_sites/tag_converting.py
@@ -15,10 +15,16 @@ def smiles_tokens(smiles: str) -> List[str]:
:param smiles: SMILES to tokenize
:return: List of tokens identified in SMILES.
"""
pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
regex = re.compile(pattern)
tokens = [token for token in regex.findall(smiles)]
assert smiles == "".join(tokens)

tokenized_smiles = "".join(tokens)
if smiles != tokenized_smiles:
raise AssertionError(
f"tokenized SMILES not the same as input SMILES: {tokenized_smiles}, "
"{smiles}, tokens: {tokens}"
)
return tokens


@@ -68,8 +74,6 @@ def tagged_smiles_from_tokens(
reaction using "<atom>!", and SMILES of the (reconstructed) untagged product
"""

print(product_tagged_tokens)

product_converted = ""
product_untagged = ""

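
A short sketch of the tokenizer behaviour after this change: the round-trip check is unchanged, but a mismatch now raises an informative AssertionError instead of failing on a bare assert. The input SMILES is arbitrary:

from rxnutils.chem.disconnection_sites.tag_converting import smiles_tokens

# Tokenize aspirin; joining the tokens must reproduce the input exactly
tokens = smiles_tokens("CC(=O)Oc1ccccc1C(=O)O")
print(tokens)
print("".join(tokens) == "CC(=O)Oc1ccccc1C(=O)O")  # True
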
11 changes: 4 additions & 7 deletions rxnutils/chem/reaction.py
@@ -1,19 +1,16 @@
"""Module containing a class to handle chemical reactions"""
import hashlib
from typing import List, Tuple, Optional, Dict, Any
from typing import Any, Dict, List, Optional, Tuple

import wrapt_timeout_decorator
from rdchiral import template_extractor as extractor
from rdkit import Chem
from rdkit.Chem import AllChem
from rdchiral import template_extractor as extractor

from rxnutils.chem.rinchi import rinchi_api
from rxnutils.chem import utils
from rxnutils.chem.rinchi import rinchi_api
from rxnutils.chem.template import ReactionTemplate
from rxnutils.chem.utils import (
reassign_rsmi_atom_mapping,
split_smiles_from_reaction,
)
from rxnutils.chem.utils import reassign_rsmi_atom_mapping, split_smiles_from_reaction


class ReactionException(Exception):
4 changes: 2 additions & 2 deletions rxnutils/chem/rinchi/download_rinchi.py
@@ -1,9 +1,9 @@
"""Module for downloading InChI Trust Reaction InChI."""
import logging
import os
import sys
import stat
import sys
from zipfile import ZipFile
import logging

import requests

5 changes: 2 additions & 3 deletions rxnutils/chem/rinchi/rinchi_api.py
@@ -1,16 +1,15 @@
"""Module containing an API to the Reaction InChI program"""
import logging
import os
import sys
import subprocess
import sys
import tempfile
from collections import namedtuple

from rdkit.Chem import AllChem

from rxnutils.chem.rinchi import download_rinchi
from rxnutils.chem.rinchi.download_rinchi import RInChIError, PLATFORM2FOLDER

from rxnutils.chem.rinchi.download_rinchi import PLATFORM2FOLDER, RInChIError

RInChIStructure = namedtuple(
"RInChI", "rinchi rauxinfo long_rinchikey short_rinchikey web_rinchikey"
7 changes: 3 additions & 4 deletions rxnutils/chem/template.py
@@ -1,18 +1,17 @@
"""Module containing useful representations of templates
"""
import re
import hashlib
import logging
import re
from collections import defaultdict
from itertools import permutations
from typing import List, Dict, Set, Iterator, Tuple, Any
from typing import Any, Dict, Iterator, List, Set, Tuple

import numpy as np
import rdchiral.main as rdc
from xxhash import xxh32
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, SanitizeFlags # pylint: disable=all

from xxhash import xxh32

DELIM_REGEX_STR = r"[&:\]]"
AROMATIC_REGEX_STR = r"&a" + DELIM_REGEX_STR
7 changes: 2 additions & 5 deletions rxnutils/chem/utils.py
@@ -1,12 +1,9 @@
"""Module containing various chemical utility routines"""

import logging
import functools
import logging
from typing import List, Tuple


import rdchiral.template_extractor

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize
@@ -292,7 +289,7 @@ def get_special_groups(mol) -> List[Tuple[Tuple[int, ...], Tuple[int, ...]]]:

# Build list
groups = []
for add_if_match, template in group_templates:
for (add_if_match, template) in group_templates:
matches = mol.GetSubstructMatches(
Chem.MolFromSmarts(template), useChirality=True
)
7 changes: 3 additions & 4 deletions rxnutils/data/base_pipeline.py
@@ -1,15 +1,14 @@
"""Module containing base class for data pipelines
"""
import os
import math
import os
from pathlib import Path
from typing import List, Tuple

import pandas as pd
from metaflow import FlowSpec, Parameter

from typing import List, Tuple

from rxnutils.data.batch_utils import create_csv_batches, combine_csv_batches
from rxnutils.data.batch_utils import combine_csv_batches, create_csv_batches

# This is hack to only import the validation_runner if rxnmapper is not installed
try:
2 changes: 1 addition & 1 deletion rxnutils/data/mapping_pipeline.py
@@ -4,7 +4,7 @@
"""
from pathlib import Path

from metaflow import step, Parameter
from metaflow import Parameter, step

from rxnutils.data.base_pipeline import DataBaseFlow
from rxnutils.data.mapping import main as map_data
2 changes: 1 addition & 1 deletion rxnutils/data/ord/import_ord_dataset.py
@@ -1,9 +1,9 @@
"""
Module containing script to import ORD dataset to a CSV file
"""
import re
import argparse
import os
import re
from collections import defaultdict
from typing import Optional, Sequence

2 changes: 1 addition & 1 deletion rxnutils/data/ord/preparation_pipeline.py
@@ -6,7 +6,7 @@
import os
from pathlib import Path

from metaflow import step, Parameter
from metaflow import Parameter, step

from rxnutils.data.base_pipeline import DataPreparationBaseFlow
from rxnutils.data.ord.import_ord_dataset import main as import_data
18 changes: 17 additions & 1 deletion rxnutils/data/uspto/combine.py
@@ -5,12 +5,15 @@
* preserve the ReactionSmiles and Year columns
* create an ID from PatentNumber and ParagraphNum and row index in the original file
"""

import argparse
from pathlib import Path
from typing import Optional, Sequence

import pandas as pd

from rxnutils.data.uspto.uspto_yield import UsptoYieldCuration

DEFAULT_FILENAMES = [
"1976_Sep2016_USPTOgrants_smiles.rsmi",
"2001_Sep2016_USPTOapplications_smiles.rsmi",
@@ -29,6 +32,12 @@ def main(args: Optional[Sequence[str]] = None) -> None:
"--output", default="uspto_data.csv", help="the output filename"
)
parser.add_argument("--folder", default=".", help="folder with downloaded files")
parser.add_argument(
"--with_yields",
action="store_true",
default=False,
help="if to add yield columns",
)
args = parser.parse_args(args)

filenames = [Path(args.folder) / filename for filename in args.filenames]
@@ -42,11 +51,18 @@ def main(args: Optional[Sequence[str]] = None) -> None:
para_num = data["ParagraphNum"].fillna("")
row_num = data.index.astype(str)
data["ID"] = data["PatentNumber"] + ";" + para_num + ";" + row_num
data2 = data[["ID", "Year", "ReactionSmiles"]]
columns = ["ID", "Year", "ReactionSmiles"]
if args.with_yields:
columns += ["TextMinedYield", "CalculatedYield"]
data2 = data[columns]

print(f"Total number of unique IDs: {len(set(data2['ID']))}")
print(f"Total number of records: {len(data2)}")

if args.with_yields:
print("Curating yields...")
data2 = UsptoYieldCuration()(data2)

data2.to_csv(Path(args.folder) / args.output, sep="\t", index=False)


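
A hedged sketch of driving the combination step programmatically with the new --with_yields option; the folder name is an assumption about where the USPTO files were downloaded:

from rxnutils.data.uspto.combine import main

# Combine the downloaded .rsmi files and append curated yield columns;
# "uspto_downloads" is a hypothetical folder holding the default filenames
main(["--folder", "uspto_downloads", "--output", "uspto_data.csv", "--with_yields"])
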
7 changes: 3 additions & 4 deletions rxnutils/data/uspto/download.py
@@ -1,14 +1,13 @@
"""Module containing a script to download USPTO files Figshare
"""
import os
import argparse
import os
from pathlib import Path
from typing import Optional, Sequence

import tqdm
import requests
import py7zr

import requests
import tqdm

FILES_TO_DOWNLOAD = [
{
3 changes: 2 additions & 1 deletion rxnutils/data/uspto/preparation_pipeline.py
@@ -2,13 +2,14 @@
Module containing pipeline for downloading, transforming and cleaning USPTO data
This needs to be run in an environment with rxnutils installed
"""

from pathlib import Path

from metaflow import step

from rxnutils.data.base_pipeline import DataPreparationBaseFlow
from rxnutils.data.uspto.download import main as download_uspto
from rxnutils.data.uspto.combine import main as combine_uspto
from rxnutils.data.uspto.download import main as download_uspto


class UsptoDataPreparationFlow(DataPreparationBaseFlow):
51 changes: 51 additions & 0 deletions rxnutils/data/uspto/uspto_yield.py
@@ -0,0 +1,51 @@
"""
Code for curating USPTO yields.
Inspiration from this code: https://github.com/DocMinus/Yield_curation_USPTO
This could potentially be an action, but since it only makes sense to use it
with USPTO data, it resides here for now.
"""

from dataclasses import dataclass

import numpy as np
import pandas as pd


@dataclass
class UsptoYieldCuration:
"""
Action for curating USPTO yield columns
"""

text_yield_column: str = "TextMinedYield"
calc_yield_column: str = "CalculatedYield"
out_column: str = "CuratedYield"

def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
calc_yield = data[self.calc_yield_column].str.rstrip("%")
calc_yield = pd.to_numeric(calc_yield, errors="coerce")
calc_yield[(calc_yield < 0) | (calc_yield > 100)] = np.nan

text_yield = data[self.text_yield_column].str.lstrip("~")
text_yield = text_yield.str.rstrip("%")
text_yield = text_yield.str.replace(">=", "", regex=False)
text_yield = text_yield.str.replace(">", "", regex=False)
text_yield = text_yield.str.replace("<", "", regex=False)
text_yield = text_yield.str.replace(r"\d{1,2}\sto\s", "", regex=True)
text_yield = pd.to_numeric(text_yield, errors="coerce")
text_yield[(text_yield < 0) | (text_yield > 100)] = np.nan

curated_yield = text_yield.copy()

sel = (~calc_yield.isna()) & (~text_yield.isna())
curated_yield[sel] = np.maximum(calc_yield[sel], text_yield[sel])

sel = (~calc_yield.isna()) & (text_yield.isna())
curated_yield[sel] = calc_yield[sel]

return data.assign(**{self.out_column: curated_yield})

def __str__(self) -> str:
return f"{self.pretty_name} (create one column with curated yield values)"