diff --git a/MANIFEST.in b/MANIFEST.in index c900892..27ae70d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ -include alfabet/model_files/* include versioneer.py include alfabet/_version.py diff --git a/alfabet/__init__.py b/alfabet/__init__.py index ecd3379..be5f91d 100644 --- a/alfabet/__init__.py +++ b/alfabet/__init__.py @@ -1,3 +1,6 @@ - from . import _version + __version__ = _version.get_versions()['version'] + +_model_tag = 'v0.1' # Tag on https://github.com/pstjohn/alfabet-models/ +_model_files_baseurl = f'https://github.com/pstjohn/alfabet-models/releases/download/{_model_tag}/' diff --git a/alfabet/_version.py b/alfabet/_version.py index 92ecfca..d74c37c 100644 --- a/alfabet/_version.py +++ b/alfabet/_version.py @@ -1,4 +1,3 @@ - # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -58,12 +57,14 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate diff --git a/alfabet/model_files/20201012_bond_embedding_nbrs.p.z b/alfabet/model_files/20201012_bond_embedding_nbrs.p.z deleted file mode 100644 index bfa14be..0000000 Binary files a/alfabet/model_files/20201012_bond_embedding_nbrs.p.z and /dev/null differ diff --git a/alfabet/model_files/20201012_bonds_for_neighbors.csv.gz b/alfabet/model_files/20201012_bonds_for_neighbors.csv.gz deleted file mode 100644 index 7fe7fba..0000000 Binary files a/alfabet/model_files/20201012_bonds_for_neighbors.csv.gz and /dev/null differ diff --git a/alfabet/model_files/best_model.hdf5 b/alfabet/model_files/best_model.hdf5 deleted file mode 100644 index 26ab1e2..0000000 Binary files 
a/alfabet/model_files/best_model.hdf5 and /dev/null differ diff --git a/alfabet/model_files/preprocessor.json b/alfabet/model_files/preprocessor.json deleted file mode 100644 index 38ed50a..0000000 --- a/alfabet/model_files/preprocessor.json +++ /dev/null @@ -1,386 +0,0 @@ -{ - "atom_tokenizer": { - "_data": { - "unk": 1, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 3)": 2, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 0)": 3, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 4, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 0, 4, 1)": 5, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 1)": 6, - "('H', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 7, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 0)": 8, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 4, 2)": 9, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 2)": 10, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 2)": 11, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 3, 0)": 12, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 2, 0)": 13, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 3, 1)": 14, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 3, 1)": 15, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 0)": 16, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 3, 0)": 17, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 3, 1)": 18, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 1)": 19, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 1)": 20, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 2)": 21, - "('C', 
0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 4, 2)": 22, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 2, 0)": 23, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 3, 4, 1)": 24, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 4, 2)": 25, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 2, 0)": 26, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 1)": 27, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 0)": 28, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 4, 1)": 29, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 0, 4, 0)": 30, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 0)": 31, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 1)": 32, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 'max', 4, 1)": 33, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 0)": 34, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 0)": 35, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 0)": 36, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 1)": 37, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 5, 4, 1)": 38, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 4, 1)": 39, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 1)": 40, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 4, 1)": 41, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 4, 2)": 42, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 1)": 43, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 0)": 44, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, 
True, 5, 2, 0)": 45, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 4, 0)": 46, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 4, 1)": 47, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 4, 0)": 48, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 0, 4, 1)": 49, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 0)": 50, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 51, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 2, 0)": 52, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 2, 0)": 53, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 5, 4, 1)": 54, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 3, 4, 0)": 55, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 4, 4, 1)": 56, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 3, 1)": 57, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 1)": 58, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 5, 4, 0)": 59, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 2, 0)": 60, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 2, 0)": 61, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 1)": 62, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 3, 0)": 63, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 3, 1)": 64, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 4, 0)": 65, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 4, 0)": 66, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 2, 0)": 67, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 3, 0)": 68, - "('C', 0, 0, 
rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 3, 4, 0)": 69, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 2, 0)": 70, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 4, 4, 1)": 71, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 3, 1)": 72, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 3, 1)": 73, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 3, 0)": 74, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 3, 4, 1)": 75, - "('Cl', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 76, - "('F', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 77, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 2, 0)": 78, - "('N', 0, -1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 79, - "('N', 0, 1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 0)": 80, - "('N', 0, 1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 0)": 81, - "('O', 0, -1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 82, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 4, 0)": 83, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 0)": 84, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 85, - "('Br', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 86, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 0)": 87, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 2, 0)": 88, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 2, 0)": 89, - "('C', 0, -1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 3, 0)": 90, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 'max', 4, 1)": 91, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 
2, 1)": 92, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 0, 3, 0)": 93, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 0, 3, 0)": 94, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 2, 0)": 95, - "('N', 0, 1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 3, 0)": 96, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 4, 0)": 97, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 3, 0)": 98, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 'max', 4, 0)": 99, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 3, 0)": 100, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 4, 3, 0)": 101, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 3, 0)": 102, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 4, 0)": 103, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 0)": 104, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 1)": 105, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 3, 1)": 106, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 4, 4, 0)": 107, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 0, 4, 0)": 108, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 5, 4, 0)": 109, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 'max', 4, 0)": 110, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 5, 3, 0)": 111, - "('C', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 0)": 112, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 0, 4, 0)": 113, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 4, 4, 0)": 114, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 2, 0)": 115, - "('S', 0, 
0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 0, 4, 0)": 116, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 2, 0)": 117, - "('C', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 0)": 118, - "('C', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 1)": 119, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 0, 4, 1)": 120, - "('C', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 0)": 121, - "('C', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 2)": 122, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 3, 3, 0)": 123, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 5, 4, 0)": 124, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 5, 4, 0)": 125, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 'max', 3, 0)": 126, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 2)": 127, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 2, 0)": 128, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 2, 0)": 129, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 5, 3, 0)": 130, - "('N', 0, 1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 3, 0)": 131, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 4, 0)": 132, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 2)": 133, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 3)": 134, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 0)": 135, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 'max', 3, 0)": 136, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 0, 4, 0)": 137, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 4)": 138, - "('P', 0, 0, 
rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 0)": 139, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 3, 2, 0)": 140, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 4, 1)": 141, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 3, 3, 1)": 142, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 0)": 143, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 0)": 144, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 0, 4, 0)": 145, - "('N', 0, 1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 0)": 146, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 4, 0)": 147, - "('N', 0, 1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 4, 0)": 148, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 2, 0)": 149, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 3, 0)": 150, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 0, 4, 1)": 151, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 1)": 152, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, False, 'max', 4, 0)": 153, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, False, 'max', 4, 0)": 154, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 'max', 2, 0)": 155, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 4, 1)": 156, - "('N', 0, 1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 4, 3, 0)": 157, - "('N', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 1)": 158, - "('C', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 4, 3, 1)": 159, - "('C', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 3, 1)": 160, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 1)": 161, - "('C', 0, -1, 
rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 0)": 162, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 4, 2, 0)": 163, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 4, 3, 1)": 164, - "('C', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 2, 0)": 165, - "('N', 0, 1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 4, 0)": 166, - "('C', 0, -1, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 3, 1)": 167, - "('O', 1, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 0, 1, 0)": 168, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 1)": 169, - "('N', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 3, 3, 1)": 170, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 5, 2, 0)": 171, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 4, 2, 0)": 172, - "('S', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 5, 3, 0)": 173, - "('P', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, False, 'max', 3, 0)": 174, - "('O', 0, 0, rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, True, 3, 2, 0)": 175 - }, - "num_classes": 175, - "train": false, - "unknown": [] - }, - "bond_tokenizer": { - "_data": { - "unk": 1, - "C-C SINGLE": 2, - "C-H SINGLE": 3, - "C-O DOUBLE": 4, - "O-C DOUBLE": 5, - "C-O SINGLE": 6, - "O-C SINGLE": 7, - "O-H SINGLE": 8, - "H-C SINGLE": 9, - "H-O SINGLE": 10, - "C-N SINGLE": 11, - "N-C SINGLE": 12, - "N-C SINGLE Rmax": 13, - "C-N SINGLE Rmax": 14, - "C-C SINGLE Rmax": 15, - "N-H SINGLE": 16, - "H-N SINGLE": 17, - "C-N AROMATIC R5": 18, - "N-C AROMATIC R5": 19, - "C-C AROMATIC R5": 20, - "C-C AROMATIC Rmax": 21, - "C-C DOUBLE": 22, - "C-O SINGLE R5": 23, - "C-C SINGLE R5": 24, - "O-C SINGLE R5": 25, - "C-C SINGLE R3": 26, - "C-N AROMATIC Rmax": 27, - "N-C AROMATIC Rmax": 28, - "C-C DOUBLE Rmax": 29, - "C-N DOUBLE": 30, - "N-C DOUBLE": 31, - "N-O SINGLE": 32, - "O-N SINGLE": 33, - "C-C DOUBLE R5": 
34, - "C-C SINGLE R4": 35, - "C-C TRIPLE": 36, - "C-O AROMATIC R5": 37, - "O-C AROMATIC R5": 38, - "N-N SINGLE": 39, - "N-C SINGLE R5": 40, - "C-N SINGLE R5": 41, - "C-N TRIPLE": 42, - "N-C TRIPLE": 43, - "N-N AROMATIC R5": 44, - "C-O SINGLE Rmax": 45, - "C-O SINGLE R3": 46, - "O-C SINGLE R3": 47, - "O-C SINGLE Rmax": 48, - "N-O AROMATIC R5": 49, - "O-N AROMATIC R5": 50, - "C-N SINGLE R4": 51, - "N-C SINGLE R4": 52, - "C-N DOUBLE R5": 53, - "N-C DOUBLE R5": 54, - "N-N SINGLE R5": 55, - "N-O SINGLE R5": 56, - "O-N SINGLE R5": 57, - "C-C TRIPLE Rmax": 58, - "N-N DOUBLE": 59, - "N-N SINGLE R3": 60, - "N-C SINGLE R3": 61, - "C-N SINGLE R3": 62, - "C-O SINGLE R4": 63, - "O-C SINGLE R4": 64, - "C-N DOUBLE Rmax": 65, - "N-C DOUBLE Rmax": 66, - "C-C DOUBLE R4": 67, - "O-O SINGLE": 68, - "N-O DOUBLE": 69, - "O-N DOUBLE": 70, - "C-Cl SINGLE": 71, - "Cl-C SINGLE": 72, - "F-C SINGLE": 73, - "C-F SINGLE": 74, - "C-S AROMATIC R5": 75, - "S-C AROMATIC R5": 76, - "C-S SINGLE Rmax": 77, - "S-C SINGLE Rmax": 78, - "S-O DOUBLE": 79, - "O-S DOUBLE": 80, - "C-S SINGLE": 81, - "S-C SINGLE": 82, - "C-S DOUBLE": 83, - "S-C DOUBLE": 84, - "C-Br SINGLE": 85, - "Br-C SINGLE": 86, - "O-S SINGLE": 87, - "S-O SINGLE": 88, - "C-S SINGLE R5": 89, - "S-C SINGLE R5": 90, - "N-S SINGLE": 91, - "S-N SINGLE": 92, - "N-S AROMATIC R5": 93, - "S-N AROMATIC R5": 94, - "N-N DOUBLE R5": 95, - "S-H SINGLE": 96, - "H-S SINGLE": 97, - "C-S SINGLE R4": 98, - "S-C SINGLE R4": 99, - "S-N SINGLE R4": 100, - "N-S SINGLE R4": 101, - "N-N SINGLE Rmax": 102, - "N-N AROMATIC Rmax": 103, - "O-O SINGLE Rmax": 104, - "C-C AROMATIC R4": 105, - "N-S SINGLE R5": 106, - "S-N SINGLE R5": 107, - "O-P SINGLE": 108, - "P-O SINGLE": 109, - "P-O DOUBLE": 110, - "O-P DOUBLE": 111, - "C-P SINGLE": 112, - "P-C SINGLE": 113, - "P-H SINGLE": 114, - "H-P SINGLE": 115, - "N-N DOUBLE Rmax": 116, - "C-C DOUBLE R3": 117, - "O-N SINGLE Rmax": 118, - "N-O SINGLE Rmax": 119, - "O-O SINGLE R5": 120, - "C-O AROMATIC Rmax": 121, - "O-C AROMATIC 
Rmax": 122, - "S-N DOUBLE": 123, - "N-S DOUBLE": 124, - "N-N DOUBLE R3": 125, - "O-O SINGLE R3": 126, - "N-O SINGLE R3": 127, - "O-N SINGLE R3": 128, - "S-S DOUBLE": 129, - "C-C AROMATIC R3": 130, - "P-C SINGLE R5": 131, - "C-P SINGLE R5": 132, - "N-P SINGLE": 133, - "P-N SINGLE": 134, - "P-O SINGLE R5": 135, - "P-N SINGLE R5": 136, - "O-P SINGLE R5": 137, - "N-P SINGLE R5": 138, - "O-N SINGLE R4": 139, - "N-O SINGLE R4": 140, - "N-C DOUBLE R3": 141, - "C-N DOUBLE R3": 142, - "O-O SINGLE R4": 143, - "S-S AROMATIC R5": 144, - "Cl-N SINGLE": 145, - "N-Cl SINGLE": 146, - "N-C DOUBLE R4": 147, - "C-N DOUBLE R4": 148, - "N-N SINGLE R4": 149, - "P-O SINGLE Rmax": 150, - "O-P SINGLE Rmax": 151, - "C-S SINGLE R3": 152, - "S-C SINGLE R3": 153, - "N-P DOUBLE": 154, - "P-N DOUBLE": 155, - "O-S SINGLE Rmax": 156, - "S-O SINGLE Rmax": 157, - "N-S SINGLE Rmax": 158, - "S-N SINGLE Rmax": 159, - "S-O SINGLE R5": 160, - "O-S SINGLE R5": 161, - "P-C SINGLE Rmax": 162, - "C-P SINGLE Rmax": 163, - "N-S DOUBLE Rmax": 164, - "S-N DOUBLE Rmax": 165, - "C-S AROMATIC Rmax": 166, - "S-C AROMATIC Rmax": 167, - "N-N DOUBLE R4": 168, - "N-N AROMATIC R4": 169, - "N-Br SINGLE": 170, - "Br-N SINGLE": 171, - "C-S DOUBLE R5": 172, - "S-C DOUBLE R5": 173, - "N-N AROMATIC R3": 174, - "N-F SINGLE": 175, - "F-N SINGLE": 176, - "N-P AROMATIC R5": 177, - "P-N AROMATIC R5": 178, - "C-O AROMATIC R4": 179, - "O-C AROMATIC R4": 180, - "S-O SINGLE R4": 181, - "O-S SINGLE R4": 182, - "O-N AROMATIC Rmax": 183, - "N-O AROMATIC Rmax": 184, - "N-P SINGLE Rmax": 185, - "P-N SINGLE Rmax": 186, - "N-O AROMATIC R3": 187, - "O-N AROMATIC R3": 188, - "C-N AROMATIC R3": 189, - "N-C AROMATIC R3": 190 - }, - "num_classes": 190, - "train": false, - "unknown": [] - }, - "explicit_hs": true, - "atom_features": {}, - "bond_features": {}, - "max_atoms": 32, - "max_bonds": 31 -} \ No newline at end of file diff --git a/alfabet/neighbors.py b/alfabet/neighbors.py index 0fbdd93..029cee6 100644 --- a/alfabet/neighbors.py +++ 
b/alfabet/neighbors.py @@ -1,16 +1,17 @@ -import os - import joblib +import numpy as np import tensorflow as tf +from pooch import retrieve +from alfabet import _model_files_baseurl from alfabet.drawing import draw_bde from alfabet.prediction import preprocessor, model, bde_dft -currdir = os.path.dirname(os.path.abspath(__file__)) -embedding_model = tf.keras.Model(model.inputs, [model.layers[17].output]) +embedding_model = tf.keras.Model(model.inputs, [model.layers[31].input]) -nbrs_pipe = joblib.load( - os.path.join(currdir, 'model_files/20201012_bond_embedding_nbrs.p.z')) +nbrs_pipe = joblib.load(retrieve( + _model_files_baseurl + 'bond_embedding_nbrs.p.z', + known_hash='9771cf104a8f6132edc51554d69d256e6f974bcad2c6d8a3e49582dcfaf809b3')) def pipe_kneighbors(pipe, X): @@ -18,14 +19,9 @@ def pipe_kneighbors(pipe, X): return pipe.steps[-1][-1].kneighbors(Xt) -def find_neighbor_bonds(smiles, bond_index, draw=True): - ds = tf.data.Dataset.from_generator( - lambda: (preprocessor.construct_feature_matrices(item, train=False) - for item in (smiles,)), - output_types=preprocessor.output_types, - output_shapes=preprocessor.output_shapes).batch(batch_size=1) - - embeddings = embedding_model.predict(ds) +def find_neighbor_bonds(smiles, bond_index, draw=False): + inputs = preprocessor.construct_feature_matrices(smiles, train=False) + embeddings = embedding_model([tf.constant(np.expand_dims(val, 0), name=key) for key, val in inputs.items()]) distances, indices = pipe_kneighbors(nbrs_pipe, embeddings[:, bond_index, :]) neighbor_df = bde_dft.dropna().iloc[indices.flatten()] diff --git a/alfabet/prediction.py b/alfabet/prediction.py index 5551813..d5cc856 100644 --- a/alfabet/prediction.py +++ b/alfabet/prediction.py @@ -1,66 +1,28 @@ import os -import warnings -import nfp import numpy as np import pandas as pd +import pooch import tensorflow as tf +from pooch import retrieve from rdkit import RDLogger +from alfabet import _model_files_baseurl from alfabet.drawing import draw_bde 
from alfabet.fragment import fragment_iterator +from alfabet.preprocessor import preprocessor RDLogger.DisableLog('rdApp.*') -currdir = os.path.dirname(os.path.abspath(__file__)) +model_files = retrieve(_model_files_baseurl + 'model.tar.gz', + known_hash='f1c2b9436f2d18c76b45d95140e6a08c096250bd5f3e2b412492ca27ab38ad0c', + processor=pooch.Untar(extract_dir='model')) +model = tf.keras.models.load_model(os.path.dirname(model_files[0])) -def atom_featurizer(atom): - """ Return an integer hash representing the atom type - """ - - return str(( - atom.GetSymbol(), - atom.GetNumRadicalElectrons(), - atom.GetFormalCharge(), - atom.GetChiralTag(), - atom.GetIsAromatic(), - nfp.get_ring_size(atom, max_size=6), - atom.GetDegree(), - atom.GetTotalNumHs(includeNeighbors=True) - )) - - -def bond_featurizer(bond, flipped=False): - if not flipped: - atoms = "{}-{}".format( - *tuple((bond.GetBeginAtom().GetSymbol(), - bond.GetEndAtom().GetSymbol()))) - else: - atoms = "{}-{}".format( - *tuple((bond.GetEndAtom().GetSymbol(), - bond.GetBeginAtom().GetSymbol()))) - - btype = str(bond.GetBondType()) - ring = 'R{}'.format(nfp.get_ring_size(bond, max_size=6)) if bond.IsInRing() else '' - - return " ".join([atoms, btype, ring]).strip() - - -preprocessor = nfp.SmilesPreprocessor( - atom_features=atom_featurizer, bond_features=bond_featurizer) - -preprocessor.from_json(os.path.join(currdir, 'model_files/preprocessor.json')) - -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - model = tf.keras.models.load_model( - os.path.join(currdir, 'model_files/best_model.hdf5'), - custom_objects=nfp.custom_objects, - compile=False) - -bde_dft = pd.read_csv(os.path.join( - currdir, 'model_files/20201012_bonds_for_neighbors.csv.gz')) +bde_dft = pd.read_csv(retrieve( + _model_files_baseurl + 'bonds_for_neighbors.csv.gz', + known_hash='96556a0d05daa2984059b1e1d9e303ea1946f2035f1345288a4698adde54e4e9')) def check_input(smiles): @@ -76,7 +38,7 @@ def check_input(smiles): missing_bond = 
np.array( list(set(iinput['bond_indices'][iinput['bond'] == 1]))) - missing_atom = np.arange(iinput['n_atom'])[iinput['atom'] == 1] + missing_atom = np.arange(len(iinput['atom']))[iinput['atom'] == 1] is_outlier = (missing_bond.size != 0) | (missing_atom.size != 0) @@ -88,18 +50,13 @@ def predict_bdes(smiles, draw=False): # valid frag_df = pd.DataFrame(fragment_iterator(smiles)) - ds = tf.data.Dataset.from_generator( - lambda: (preprocessor.construct_feature_matrices(item, train=False) - for item in (smiles,)), - output_types=preprocessor.output_types, - output_shapes=preprocessor.output_shapes).batch(batch_size=1) - - bde_pred, bdfe_pred = model.predict(ds) + inputs = preprocessor.construct_feature_matrices(smiles, train=False) + bde_pred, bdfe_pred = model([tf.constant(np.expand_dims(val, 0), name=key) for key, val in inputs.items()]) # Reindex predictions to fragment dataframe - frag_df['bde_pred'] = pd.Series(bde_pred.squeeze()) \ + frag_df['bde_pred'] = pd.Series(bde_pred.numpy().squeeze()) \ .reindex(frag_df.bond_index).reset_index(drop=True) - frag_df['bdfe_pred'] = pd.Series(bdfe_pred.squeeze()) \ + frag_df['bdfe_pred'] = pd.Series(bdfe_pred.numpy().squeeze()) \ .reindex(frag_df.bond_index).reset_index(drop=True) # Add DFT calculated bdes diff --git a/alfabet/preprocessor.py b/alfabet/preprocessor.py new file mode 100644 index 0000000..8f25c55 --- /dev/null +++ b/alfabet/preprocessor.py @@ -0,0 +1,50 @@ +import nfp +from pooch import retrieve + +from alfabet import _model_files_baseurl + + +def atom_featurizer(atom): + """ Return a string key describing the atom type + """ + + return str(( + atom.GetSymbol(), + atom.GetNumRadicalElectrons(), + atom.GetFormalCharge(), + atom.GetChiralTag(), + atom.GetIsAromatic(), + nfp.get_ring_size(atom, max_size=6), + atom.GetDegree(), + atom.GetTotalNumHs(includeNeighbors=True) + )) + + +def bond_featurizer(bond, flipped=False): + if not flipped: + atoms = "{}-{}".format( + *tuple((bond.GetBeginAtom().GetSymbol(), + 
bond.GetEndAtom().GetSymbol()))) + else: + atoms = "{}-{}".format( + *tuple((bond.GetEndAtom().GetSymbol(), + bond.GetBeginAtom().GetSymbol()))) + + btype = str((bond.GetBondType(), + bond.GetIsConjugated())) + ring = 'R{}'.format(nfp.get_ring_size(bond, max_size=6)) if bond.IsInRing() else '' + + return " ".join([atoms, btype, ring]).strip() + + +preprocessor = nfp.SmilesPreprocessor( + atom_features=atom_featurizer, + bond_features=bond_featurizer, + explicit_hs=True, + bond_indices=True, + output_dtype='int64' +) + +preprocessor.from_json(retrieve( + _model_files_baseurl + 'preprocessor.json', + known_hash='412d15ca4d0e8b5030e9b497f566566922818ff355b8ee677a91dd23696878ac')) diff --git a/etc/environment.yml b/etc/environment.yml index e283a64..1ec1510 100644 --- a/etc/environment.yml +++ b/etc/environment.yml @@ -9,5 +9,7 @@ dependencies: - pytest - pandas - tqdm + - joblib + - scikit-learn - numpy=1.19.* - tensorflow diff --git a/setup.py b/setup.py index 612075d..669209e 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ ], packages=find_packages(exclude=['docs', 'tests']), # Required - install_requires=['pandas', 'nfp==0.1.4', 'tqdm'], + install_requires=['pandas', 'nfp==0.3.3', 'tqdm', 'pooch', 'joblib', 'scikit-learn'], project_urls={ 'Source': 'https://github.com/NREL/alfabet', diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py new file mode 100644 index 0000000..bc8ad15 --- /dev/null +++ b/tests/test_neighbors.py @@ -0,0 +1,15 @@ +import rdkit.Chem + +from alfabet.neighbors import find_neighbor_bonds + + +def test_find_neighbor_bonds(): + neighbor_df = find_neighbor_bonds('CC', 0) + assert neighbor_df.distance.min() < 1E-3 # bond should be in the database + + for _, row in neighbor_df.iterrows(): + mol = rdkit.Chem.AddHs(rdkit.Chem.MolFromSmiles(row.molecule)) + bond = mol.GetBondWithIdx(row.bond_index) + assert bond.GetEndAtom().GetSymbol() == 
'C' + assert bond.GetBeginAtom().GetSymbol() == 'C' + assert bond.GetBondType() == rdkit.Chem.rdchem.BondType.SINGLE \ No newline at end of file