Skip to content

Commit

Permalink
Merge pull request #10 from NREL/psj-speedup
Browse files Browse the repository at this point in the history
minimal edits to get faster fragmenting
pstjohn authored Aug 8, 2022
2 parents 8ecbe4b + 14970d6 commit fce4496
Showing 3 changed files with 84 additions and 53 deletions.
5 changes: 5 additions & 0 deletions alfabet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from rdkit import RDLogger

from . import _version

RDLogger.DisableLog("rdApp.*")


__version__ = _version.get_versions()["version"]

_model_tag = "v0.1.1" # Tag on https://github.com/pstjohn/alfabet-models/
123 changes: 74 additions & 49 deletions alfabet/fragment.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,65 @@
import logging
from collections import Counter
from typing import Iterator
from typing import Dict, Iterator, Type

import pandas as pd
import rdkit
import rdkit.Chem
import rdkit.Chem.AllChem
from rdkit import RDLogger

RDLogger.DisableLog("rdApp.*")

def get_fragments(smiles: str) -> pd.DataFrame:
return pd.DataFrame(fragment_iterator(smiles))

class Molecule:
def __init__(self, mol: Type[rdkit.Chem.Mol] = None, smiles: str = None) -> None:
assert (mol is not None) or (
smiles is not None
), "mol or smiles must be provided"

def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[pd.Series]:
mol_stereo = count_stereocenters(smiles)
self._mol = mol
self._smiles = smiles
self._molH = None
self._is_canon = False

@property
def mol(self) -> Type[rdkit.Chem.Mol]:
if self._mol is None:
self._mol = rdkit.Chem.MolFromSmiles(self._smiles)
return self._mol

@property
def molH(self) -> Type[rdkit.Chem.Mol]:
if self._molH is None:
self._molH = rdkit.Chem.AddHs(self.mol)
return self._molH

@property
def smiles(self) -> str:
if (self._smiles is None) or not self._is_canon:
self._smiles = rdkit.Chem.MolToSmiles(self._mol)
return self._smiles


def get_fragments(smiles: str, drop_duplicates: bool = False) -> pd.DataFrame:
df = pd.DataFrame(fragment_iterator(smiles))
if drop_duplicates:
df = df.drop_duplicates(["fragment1", "fragment2"]).reset_index(drop=True)
return df


def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[Dict]:

input_molecule = Molecule(smiles=smiles)
mol_stereo = count_stereocenters(input_molecule)
if (mol_stereo["atom_unassigned"] != 0) or (mol_stereo["bond_unassigned"] != 0):
logging.warning(f"Molecule {smiles} has undefined stereochemistry")
if skip_warnings:
return

mol = rdkit.Chem.MolFromSmiles(smiles)
mol = rdkit.Chem.rdmolops.AddHs(mol)
rdkit.Chem.Kekulize(mol, clearAromaticFlags=True)
rdkit.Chem.Kekulize(input_molecule.molH, clearAromaticFlags=True)

for bond in mol.GetBonds():
for bond in input_molecule.molH.GetBonds():

if bond.IsInRing():
continue
@@ -34,7 +70,7 @@ def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[pd.S
try:

# Use RDkit to break the given bond
mh = rdkit.Chem.RWMol(mol)
mh = rdkit.Chem.RWMol(input_molecule.molH)
a1 = bond.GetBeginAtomIdx()
a2 = bond.GetEndAtomIdx()
mh.RemoveBond(a1, a2)
@@ -50,29 +86,27 @@ def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[pd.S

# Split fragment and canonicalize
frag1, frag2 = sorted(fragmented_smiles.split("."))
frag1 = canonicalize_smiles(frag1)
frag2 = canonicalize_smiles(frag2)
frag1 = Molecule(smiles=frag1)
frag2 = Molecule(smiles=frag2)

# Stoichiometry check
assert (
count_atom_types(frag1) + count_atom_types(frag2)
) == count_atom_types(smiles), "Error with {}; {}; {}".format(
frag1, frag2, smiles
) == count_atom_types(input_molecule), "Error with {}; {}; {}".format(
frag1.smiles, frag2.smiles, input_molecule.smiles
)

# Check introduction of new stereocenters
is_valid_stereo = check_stereocenters(frag1) and check_stereocenters(frag2)

yield pd.Series(
{
"molecule": smiles,
"bond_index": bond.GetIdx(),
"bond_type": get_bond_type(bond),
"fragment1": frag1,
"fragment2": frag2,
"is_valid_stereo": is_valid_stereo,
}
)
yield {
"molecule": input_molecule.smiles,
"bond_index": bond.GetIdx(),
"bond_type": get_bond_type(bond),
"fragment1": frag1.smiles,
"fragment2": frag2.smiles,
"is_valid_stereo": is_valid_stereo,
}

except ValueError:
logging.error(
@@ -81,30 +115,23 @@ def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[pd.S
continue


def count_atom_types(smiles):
def count_atom_types(molecule: Type[Molecule]):
"""Return a dictionary of each atom type in the given fragment or molecule"""
mol = rdkit.Chem.MolFromSmiles(smiles, sanitize=True)
mol = rdkit.Chem.rdmolops.AddHs(mol)
return Counter([atom.GetSymbol() for atom in mol.GetAtoms()])

return Counter([atom.GetSymbol() for atom in molecule.molH.GetAtoms()])

def canonicalize_smiles(smiles: str) -> str:
"""Return a consistent SMILES representation for the given molecule"""
mol = rdkit.Chem.MolFromSmiles(smiles)
return rdkit.Chem.MolToSmiles(mol)


def count_stereocenters(smiles: str) -> pd.Series:
def count_stereocenters(molecule: Type[Molecule]) -> Dict:
"""Returns a count of both assigned and unassigned stereocenters in the
given molecule"""

mol = rdkit.Chem.MolFromSmiles(smiles)
rdkit.Chem.FindPotentialStereoBonds(mol)
rdkit.Chem.FindPotentialStereoBonds(molecule.mol)

stereocenters = rdkit.Chem.FindMolChiralCenters(mol, includeUnassigned=True)
stereocenters = rdkit.Chem.FindMolChiralCenters(
molecule.mol, includeUnassigned=True
)
stereobonds = [
bond
for bond in mol.GetBonds()
for bond in molecule.mol.GetBonds()
if bond.GetStereo() is not rdkit.Chem.rdchem.BondStereo.STEREONONE
]

@@ -126,21 +153,19 @@ def count_stereocenters(smiles: str) -> pd.Series:
]
)

return pd.Series(
{
"atom_assigned": atom_assigned,
"atom_unassigned": atom_unassigned,
"bond_assigned": bond_assigned,
"bond_unassigned": bond_unassigned,
}
)
return {
"atom_assigned": atom_assigned,
"atom_unassigned": atom_unassigned,
"bond_assigned": bond_assigned,
"bond_unassigned": bond_unassigned,
}


def check_stereocenters(smiles):
def check_stereocenters(molecule: Type[Molecule]):
"""Check the given SMILES string to determine whether accurate
enthalpies can be calculated with the given stereochem information
"""
stereocenters = count_stereocenters(smiles)
stereocenters = count_stereocenters(molecule)
if stereocenters["bond_unassigned"] > 0:
return False

9 changes: 5 additions & 4 deletions alfabet/model.py
Original file line number Diff line number Diff line change
@@ -46,11 +46,12 @@ def predict(smiles_list, drop_duplicates=True, batch_size=1):
domain of validity
"""

pred_df = pd.concat((get_fragments(smiles) for smiles in smiles_list))
if drop_duplicates:
pred_df = pred_df.drop_duplicates(["fragment1", "fragment2"]).reset_index(
drop=True
pred_df = pd.concat(
(
get_fragments(smiles, drop_duplicates=drop_duplicates)
for smiles in smiles_list
)
)

input_dataset = tf.data.Dataset.from_generator(
lambda: (

0 comments on commit fce4496

Please sign in to comment.