Merge pull request #10 from NREL/psj-speedup

minimal edits to get faster fragmenting
NREL · Aug 8, 2022 · fce4496 · fce4496
2 parents 8ecbe4b + 14970d6
commit fce4496
Showing 3 changed files with 84 additions and 53 deletions.
diff --git a/alfabet/__init__.py b/alfabet/__init__.py
@@ -1,5 +1,10 @@
+from rdkit import RDLogger
+
 from . import _version
 
+RDLogger.DisableLog("rdApp.*")
+
+
 __version__ = _version.get_versions()["version"]
 
 _model_tag = "v0.1.1"  # Tag on https://github.com/pstjohn/alfabet-models/

diff --git a/alfabet/fragment.py b/alfabet/fragment.py
@@ -1,29 +1,65 @@
 import logging
 from collections import Counter
-from typing import Iterator
+from typing import Dict, Iterator, Type
 
 import pandas as pd
 import rdkit
 import rdkit.Chem
 import rdkit.Chem.AllChem
+from rdkit import RDLogger
 
+RDLogger.DisableLog("rdApp.*")
 
-def get_fragments(smiles: str) -> pd.DataFrame:
-    return pd.DataFrame(fragment_iterator(smiles))
 
+class Molecule:
+    def __init__(self, mol: Type[rdkit.Chem.Mol] = None, smiles: str = None) -> None:
+        assert (mol is not None) or (
+            smiles is not None
+        ), "mol or smiles must be provided"
 
-def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[pd.Series]:
-    mol_stereo = count_stereocenters(smiles)
+        self._mol = mol
+        self._smiles = smiles
+        self._molH = None
+        self._is_canon = False
+
+    @property
+    def mol(self) -> Type[rdkit.Chem.Mol]:
+        if self._mol is None:
+            self._mol = rdkit.Chem.MolFromSmiles(self._smiles)
+        return self._mol
+
+    @property
+    def molH(self) -> Type[rdkit.Chem.Mol]:
+        if self._molH is None:
+            self._molH = rdkit.Chem.AddHs(self.mol)
+        return self._molH
+
+    @property
+    def smiles(self) -> str:
+        if (self._smiles is None) or not self._is_canon:
+            self._smiles = rdkit.Chem.MolToSmiles(self._mol)
+        return self._smiles
+
+
+def get_fragments(smiles: str, drop_duplicates: bool = False) -> pd.DataFrame:
+    df = pd.DataFrame(fragment_iterator(smiles))
+    if drop_duplicates:
+        df = df.drop_duplicates(["fragment1", "fragment2"]).reset_index(drop=True)
+    return df
+
+
+def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[Dict]:
+
+    input_molecule = Molecule(smiles=smiles)
+    mol_stereo = count_stereocenters(input_molecule)
     if (mol_stereo["atom_unassigned"] != 0) or (mol_stereo["bond_unassigned"] != 0):
         logging.warning(f"Molecule {smiles} has undefined stereochemistry")
         if skip_warnings:
             return
 
-    mol = rdkit.Chem.MolFromSmiles(smiles)
-    mol = rdkit.Chem.rdmolops.AddHs(mol)
-    rdkit.Chem.Kekulize(mol, clearAromaticFlags=True)
+    rdkit.Chem.Kekulize(input_molecule.molH, clearAromaticFlags=True)
 
-    for bond in mol.GetBonds():
+    for bond in input_molecule.molH.GetBonds():
 
         if bond.IsInRing():
             continue
@@ -34,7 +70,7 @@ def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[pd.S
         try:
 
             # Use RDkit to break the given bond
-            mh = rdkit.Chem.RWMol(mol)
+            mh = rdkit.Chem.RWMol(input_molecule.molH)
             a1 = bond.GetBeginAtomIdx()
             a2 = bond.GetEndAtomIdx()
             mh.RemoveBond(a1, a2)
@@ -50,29 +86,27 @@ def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[pd.S
 
             # Split fragment and canonicalize
             frag1, frag2 = sorted(fragmented_smiles.split("."))
-            frag1 = canonicalize_smiles(frag1)
-            frag2 = canonicalize_smiles(frag2)
+            frag1 = Molecule(smiles=frag1)
+            frag2 = Molecule(smiles=frag2)
 
             # Stoichiometry check
             assert (
                 count_atom_types(frag1) + count_atom_types(frag2)
-            ) == count_atom_types(smiles), "Error with {}; {}; {}".format(
-                frag1, frag2, smiles
+            ) == count_atom_types(input_molecule), "Error with {}; {}; {}".format(
+                frag1.smiles, frag2.smiles, input_molecule.smiles
             )
 
             # Check introduction of new stereocenters
             is_valid_stereo = check_stereocenters(frag1) and check_stereocenters(frag2)
 
-            yield pd.Series(
-                {
-                    "molecule": smiles,
-                    "bond_index": bond.GetIdx(),
-                    "bond_type": get_bond_type(bond),
-                    "fragment1": frag1,
-                    "fragment2": frag2,
-                    "is_valid_stereo": is_valid_stereo,
-                }
-            )
+            yield {
+                "molecule": input_molecule.smiles,
+                "bond_index": bond.GetIdx(),
+                "bond_type": get_bond_type(bond),
+                "fragment1": frag1.smiles,
+                "fragment2": frag2.smiles,
+                "is_valid_stereo": is_valid_stereo,
+            }
 
         except ValueError:
             logging.error(
@@ -81,30 +115,23 @@ def fragment_iterator(smiles: str, skip_warnings: bool = False) -> Iterator[pd.S
             continue
 
 
-def count_atom_types(smiles):
+def count_atom_types(molecule: Type[Molecule]):
     """Return a dictionary of each atom type in the given fragment or molecule"""
-    mol = rdkit.Chem.MolFromSmiles(smiles, sanitize=True)
-    mol = rdkit.Chem.rdmolops.AddHs(mol)
-    return Counter([atom.GetSymbol() for atom in mol.GetAtoms()])
-
+    return Counter([atom.GetSymbol() for atom in molecule.molH.GetAtoms()])
 
-def canonicalize_smiles(smiles: str) -> str:
-    """Return a consistent SMILES representation for the given molecule"""
-    mol = rdkit.Chem.MolFromSmiles(smiles)
-    return rdkit.Chem.MolToSmiles(mol)
 
-
-def count_stereocenters(smiles: str) -> pd.Series:
+def count_stereocenters(molecule: Type[Molecule]) -> Dict:
     """Returns a count of both assigned and unassigned stereocenters in the
     given molecule"""
 
-    mol = rdkit.Chem.MolFromSmiles(smiles)
-    rdkit.Chem.FindPotentialStereoBonds(mol)
+    rdkit.Chem.FindPotentialStereoBonds(molecule.mol)
 
-    stereocenters = rdkit.Chem.FindMolChiralCenters(mol, includeUnassigned=True)
+    stereocenters = rdkit.Chem.FindMolChiralCenters(
+        molecule.mol, includeUnassigned=True
+    )
     stereobonds = [
         bond
-        for bond in mol.GetBonds()
+        for bond in molecule.mol.GetBonds()
         if bond.GetStereo() is not rdkit.Chem.rdchem.BondStereo.STEREONONE
     ]
 
@@ -126,21 +153,19 @@ def count_stereocenters(smiles: str) -> pd.Series:
         ]
     )
 
-    return pd.Series(
-        {
-            "atom_assigned": atom_assigned,
-            "atom_unassigned": atom_unassigned,
-            "bond_assigned": bond_assigned,
-            "bond_unassigned": bond_unassigned,
-        }
-    )
+    return {
+        "atom_assigned": atom_assigned,
+        "atom_unassigned": atom_unassigned,
+        "bond_assigned": bond_assigned,
+        "bond_unassigned": bond_unassigned,
+    }
 
 
-def check_stereocenters(smiles):
+def check_stereocenters(molecule: Type[Molecule]):
     """Check the given SMILES string to determine whether accurate
     enthalpies can be calculated with the given stereochem information
     """
-    stereocenters = count_stereocenters(smiles)
+    stereocenters = count_stereocenters(molecule)
     if stereocenters["bond_unassigned"] > 0:
         return False
 

diff --git a/alfabet/model.py b/alfabet/model.py
@@ -46,11 +46,12 @@ def predict(smiles_list, drop_duplicates=True, batch_size=1):
                    domain of validity
     """
 
-    pred_df = pd.concat((get_fragments(smiles) for smiles in smiles_list))
-    if drop_duplicates:
-        pred_df = pred_df.drop_duplicates(["fragment1", "fragment2"]).reset_index(
-            drop=True
+    pred_df = pd.concat(
+        (
+            get_fragments(smiles, drop_duplicates=drop_duplicates)
+            for smiles in smiles_list
         )
+    )
 
     input_dataset = tf.data.Dataset.from_generator(
         lambda: (