Skip to content

Commit

Permalink
Merge pull request #3 from NREL/nfp-0.3.3
Browse files Browse the repository at this point in the history
Update for new model files
  • Loading branch information
pstjohn authored Sep 22, 2021
2 parents f1d9d05 + 127a554 commit c466a1b
Show file tree
Hide file tree
Showing 14 changed files with 100 additions and 463 deletions.
1 change: 0 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
include alfabet/model_files/*
include versioneer.py
include alfabet/_version.py
5 changes: 4 additions & 1 deletion alfabet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@

from . import _version

# Package version string, as reported by the bundled _version module.
__version__ = _version.get_versions()['version']

# Release tag and the base download URL derived from it; other modules in the
# package append a file name to this URL to locate the hosted model files.
_model_tag = 'v0.1' # Tag on https://github.com/pstjohn/alfabet-models/
_model_files_baseurl = f'https://github.com/pstjohn/alfabet-models/releases/download/{_model_tag}/'
3 changes: 2 additions & 1 deletion alfabet/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by GitHub's download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
Expand Down Expand Up @@ -58,12 +57,14 @@ class NotThisMethod(Exception):

def register_vcs_handler(vcs, method):  # decorator
    """Build a decorator that registers a function as a VCS handler.

    The returned decorator files the decorated function under
    ``HANDLERS[vcs][method]`` and hands the function back unchanged.
    """

    def decorate(f):
        """Record f as the handler for this (vcs, method) pair."""
        # setdefault creates the per-VCS table the first time it is needed.
        HANDLERS.setdefault(vcs, {})[method] = f
        return f

    return decorate


Expand Down
Binary file removed alfabet/model_files/20201012_bond_embedding_nbrs.p.z
Binary file not shown.
Binary file not shown.
Binary file removed alfabet/model_files/best_model.hdf5
Binary file not shown.
386 changes: 0 additions & 386 deletions alfabet/model_files/preprocessor.json

This file was deleted.

24 changes: 10 additions & 14 deletions alfabet/neighbors.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,27 @@
import os

import joblib
import numpy as np
import tensorflow as tf
from pooch import retrieve

from alfabet import _model_files_baseurl
from alfabet.drawing import draw_bde
from alfabet.prediction import preprocessor, model, bde_dft

currdir = os.path.dirname(os.path.abspath(__file__))
embedding_model = tf.keras.Model(model.inputs, [model.layers[17].output])
embedding_model = tf.keras.Model(model.inputs, [model.layers[31].input])

nbrs_pipe = joblib.load(
os.path.join(currdir, 'model_files/20201012_bond_embedding_nbrs.p.z'))
nbrs_pipe = joblib.load(retrieve(
_model_files_baseurl + 'bond_embedding_nbrs.p.z',
known_hash='9771cf104a8f6132edc51554d69d256e6f974bcad2c6d8a3e49582dcfaf809b3'))


def pipe_kneighbors(pipe, X):
    """Run a neighbour query through the first and last steps of a pipeline.

    X is passed to the ``transform`` method of the object held in the first
    entry of ``pipe.steps``; the transformed data is then handed to the
    ``kneighbors`` method of the object held in the last entry.

    Returns whatever that ``kneighbors`` call returns.
    """
    first_step, last_step = pipe.steps[0], pipe.steps[-1]
    # Each step is indexed with [-1] to pick the estimator out of the entry.
    transformed = first_step[-1].transform(X)
    return last_step[-1].kneighbors(transformed)


def find_neighbor_bonds(smiles, bond_index, draw=True):
ds = tf.data.Dataset.from_generator(
lambda: (preprocessor.construct_feature_matrices(item, train=False)
for item in (smiles,)),
output_types=preprocessor.output_types,
output_shapes=preprocessor.output_shapes).batch(batch_size=1)

embeddings = embedding_model.predict(ds)
def find_neighbor_bonds(smiles, bond_index, draw=False):
inputs = preprocessor.construct_feature_matrices(smiles, train=False)
embeddings = embedding_model([tf.constant(np.expand_dims(val, 0), name=val) for key, val in inputs.items()])
distances, indices = pipe_kneighbors(nbrs_pipe, embeddings[:, bond_index, :])

neighbor_df = bde_dft.dropna().iloc[indices.flatten()]
Expand Down
75 changes: 16 additions & 59 deletions alfabet/prediction.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,28 @@
import os
import warnings

import nfp
import numpy as np
import pandas as pd
import pooch
import tensorflow as tf
from pooch import retrieve
from rdkit import RDLogger

from alfabet import _model_files_baseurl
from alfabet.drawing import draw_bde
from alfabet.fragment import fragment_iterator
from alfabet.preprocessor import preprocessor

RDLogger.DisableLog('rdApp.*')

currdir = os.path.dirname(os.path.abspath(__file__))
model_files = retrieve(_model_files_baseurl + 'model.tar.gz',
known_hash='f1c2b9436f2d18c76b45d95140e6a08c096250bd5f3e2b412492ca27ab38ad0c',
processor=pooch.Untar(extract_dir='model'))

model = tf.keras.models.load_model(os.path.dirname(model_files[0]))

def atom_featurizer(atom):
    """Return a string key identifying the atom type.

    The key is the ``str`` of a tuple holding, in order: the atom's symbol,
    radical electron count, formal charge, chiral tag, aromaticity flag,
    ring size (from ``nfp.get_ring_size`` with ``max_size=6``), degree, and
    total hydrogen count including neighbors.
    """
    atom_type = (
        atom.GetSymbol(),
        atom.GetNumRadicalElectrons(),
        atom.GetFormalCharge(),
        atom.GetChiralTag(),
        atom.GetIsAromatic(),
        nfp.get_ring_size(atom, max_size=6),
        atom.GetDegree(),
        atom.GetTotalNumHs(includeNeighbors=True),
    )
    return str(atom_type)


def bond_featurizer(bond, flipped=False):
    """Return a string key identifying the bond type.

    The key joins, separated by single spaces: the symbols of the two bonded
    atoms as ``"A-B"`` (end atom first when ``flipped`` is true), the string
    form of the bond type, and an ``R<size>`` ring marker when the bond is
    in a ring. Bonds outside a ring get no ring marker.
    """
    first_atom, second_atom = bond.GetBeginAtom(), bond.GetEndAtom()
    if flipped:
        first_atom, second_atom = second_atom, first_atom
    atoms = "{}-{}".format(first_atom.GetSymbol(), second_atom.GetSymbol())

    btype = str(bond.GetBondType())

    ring = ''
    if bond.IsInRing():
        ring = 'R{}'.format(nfp.get_ring_size(bond, max_size=6))

    # strip() removes the trailing space left behind by an empty ring marker.
    parts = (atoms, btype, ring)
    return " ".join(parts).strip()


preprocessor = nfp.SmilesPreprocessor(
atom_features=atom_featurizer, bond_features=bond_featurizer)

preprocessor.from_json(os.path.join(currdir, 'model_files/preprocessor.json'))

with warnings.catch_warnings():
warnings.simplefilter('ignore')
model = tf.keras.models.load_model(
os.path.join(currdir, 'model_files/best_model.hdf5'),
custom_objects=nfp.custom_objects,
compile=False)

bde_dft = pd.read_csv(os.path.join(
currdir, 'model_files/20201012_bonds_for_neighbors.csv.gz'))
bde_dft = pd.read_csv(retrieve(
_model_files_baseurl + 'bonds_for_neighbors.csv.gz',
known_hash='96556a0d05daa2984059b1e1d9e303ea1946f2035f1345288a4698adde54e4e9'))


def check_input(smiles):
Expand All @@ -76,7 +38,7 @@ def check_input(smiles):

missing_bond = np.array(
list(set(iinput['bond_indices'][iinput['bond'] == 1])))
missing_atom = np.arange(iinput['n_atom'])[iinput['atom'] == 1]
missing_atom = np.arange(len(iinput['atom']))[iinput['atom'] == 1]

is_outlier = (missing_bond.size != 0) | (missing_atom.size != 0)

Expand All @@ -88,18 +50,13 @@ def predict_bdes(smiles, draw=False):
# valid
frag_df = pd.DataFrame(fragment_iterator(smiles))

ds = tf.data.Dataset.from_generator(
lambda: (preprocessor.construct_feature_matrices(item, train=False)
for item in (smiles,)),
output_types=preprocessor.output_types,
output_shapes=preprocessor.output_shapes).batch(batch_size=1)

bde_pred, bdfe_pred = model.predict(ds)
inputs = preprocessor.construct_feature_matrices(smiles, train=False)
bde_pred, bdfe_pred = model([tf.constant(np.expand_dims(val, 0), name=val) for key, val in inputs.items()])

# Reindex predictions to fragment dataframe
frag_df['bde_pred'] = pd.Series(bde_pred.squeeze()) \
frag_df['bde_pred'] = pd.Series(bde_pred.numpy().squeeze()) \
.reindex(frag_df.bond_index).reset_index(drop=True)
frag_df['bdfe_pred'] = pd.Series(bdfe_pred.squeeze()) \
frag_df['bdfe_pred'] = pd.Series(bdfe_pred.numpy().squeeze()) \
.reindex(frag_df.bond_index).reset_index(drop=True)

# Add DFT calculated bdes
Expand Down
50 changes: 50 additions & 0 deletions alfabet/preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import nfp
from pooch import retrieve

from alfabet import _model_files_baseurl


def atom_featurizer(atom):
    """Return a string key identifying the atom type.

    The key is the ``str`` of a tuple holding, in order: the atom's symbol,
    radical electron count, formal charge, chiral tag, aromaticity flag,
    ring size (from ``nfp.get_ring_size`` with ``max_size=6``), degree, and
    total hydrogen count including neighbors.
    """
    descriptors = (
        atom.GetSymbol(),
        atom.GetNumRadicalElectrons(),
        atom.GetFormalCharge(),
        atom.GetChiralTag(),
        atom.GetIsAromatic(),
        nfp.get_ring_size(atom, max_size=6),
        atom.GetDegree(),
        atom.GetTotalNumHs(includeNeighbors=True),
    )
    return str(descriptors)


def bond_featurizer(bond, flipped=False):
    """Return a string key identifying the bond type.

    The key joins, separated by single spaces: the symbols of the two bonded
    atoms as ``"A-B"`` (end atom first when ``flipped`` is true), the string
    form of the ``(bond type, is-conjugated)`` tuple, and an ``R<size>`` ring
    marker when the bond is in a ring. Bonds outside a ring get no ring
    marker.
    """
    first_atom, second_atom = bond.GetBeginAtom(), bond.GetEndAtom()
    if flipped:
        first_atom, second_atom = second_atom, first_atom
    atoms = "{}-{}".format(first_atom.GetSymbol(), second_atom.GetSymbol())

    bond_kind = (bond.GetBondType(), bond.GetIsConjugated())
    btype = str(bond_kind)

    ring = ''
    if bond.IsInRing():
        ring = 'R{}'.format(nfp.get_ring_size(bond, max_size=6))

    # strip() removes the trailing space left behind by an empty ring marker.
    parts = (atoms, btype, ring)
    return " ".join(parts).strip()


# Module-level preprocessor instance, built around the two featurizer
# functions defined above in this module.
preprocessor = nfp.SmilesPreprocessor(
atom_features=atom_featurizer,
bond_features=bond_featurizer,
explicit_hs=True,
bond_indices=True,
output_dtype='int64'
)

# Load the preprocessor state from the 'preprocessor.json' file under the
# model-files base URL; the path comes from pooch's retrieve(), which is given
# the expected hash of the file.
# NOTE(review): this runs at import time and presumably downloads the file on
# first use -- confirm caching behaviour against the pooch documentation.
preprocessor.from_json(retrieve(
_model_files_baseurl + 'preprocessor.json',
known_hash='412d15ca4d0e8b5030e9b497f566566922818ff355b8ee677a91dd23696878ac'))
2 changes: 2 additions & 0 deletions etc/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,7 @@ dependencies:
- pytest
- pandas
- tqdm
- joblib
- scikit-learn
- numpy=1.19.*
- tensorflow
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
],

packages=find_packages(exclude=['docs', 'tests']), # Required
install_requires=['pandas', 'nfp==0.1.4', 'tqdm'],
install_requires=['pandas', 'nfp==0.3.3', 'tqdm', 'pooch', 'joblib', 'scikit-learn'],

project_urls={
'Source': 'https://github.com/NREL/alfabet',
Expand Down
Empty file added tests/__init__.py
Empty file.
15 changes: 15 additions & 0 deletions tests/test_neighbors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import rdkit.Chem

from alfabet.neighbors import find_neighbor_bonds


def test_find_neighbor_bonds():
    """Check the neighbours found for bond 0 of ethane.

    The closest neighbour must be essentially at zero distance, and every
    returned neighbour bond must be a carbon-carbon single bond.
    """
    neighbors = find_neighbor_bonds('CC', 0)
    assert neighbors.distance.min() < 1E-3  # bond should be in the database

    single_bond = rdkit.Chem.rdchem.BondType.SINGLE
    for _, record in neighbors.iterrows():
        # Bond indices are looked up on the molecule with explicit hydrogens.
        parsed = rdkit.Chem.MolFromSmiles(record.molecule)
        bond = rdkit.Chem.AddHs(parsed).GetBondWithIdx(record.bond_index)
        assert bond.GetEndAtom().GetSymbol() == 'C'
        assert bond.GetBeginAtom().GetSymbol() == 'C'
        assert bond.GetBondType() == single_bond

0 comments on commit c466a1b

Please sign in to comment.