REINVENT v3.2

MolecularAI · Jun 1, 2022 · b7324d2 · b7324d2
1 parent e2463b8
commit b7324d2
Show file tree

Hide file tree

Showing 102 changed files with 1,973 additions and 835 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,34 @@
+# Image for the REINVENT CLI: Lua and Lmod are built from source tarballs, then the
+# conda environment described in reinvent.yml is installed into the base environment.
+FROM docker.io/continuumio/conda-ci-linux-64-python3.7:latest
+
+USER root
+
+# Every step is chained with `&&` (never `;`) so that a failing command aborts the
+# build instead of being masked by the exit status of a later command.
+RUN apt-get update && \
+    apt-get -y install rsync procps && \
+    wget https://sourceforge.net/projects/lmod/files/lua-5.1.4.9.tar.bz2 && \
+    tar xf lua-5.1.4.9.tar.bz2 && \
+    cd lua-5.1.4.9 && \
+    ./configure --prefix=/opt/apps/lua/5.1.4.9 && \
+    make && make install && \
+    cd /opt/apps/lua && ln -s 5.1.4.9 lua && \
+    ln -s /opt/apps/lua/lua/bin/lua /usr/local/bin && \
+    ln -s /opt/apps/lua/lua/bin/luac /usr/local/bin && \
+    cd && wget https://sourceforge.net/projects/lmod/files/Lmod-8.2.tar.bz2 && \
+    tar xf Lmod-8.2.tar.bz2 && \
+    cd Lmod-8.2 && ./configure --prefix=/opt/apps --with-fastTCLInterp=no && \
+    make install && \
+    ln -s /opt/apps/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh
+
+# Locations of the Lmod installation created by the RUN step above.
+ENV LMOD_ROOT=/opt/apps/lmod \
+    LMOD_PKG=/opt/apps/lmod/lmod \
+    LMOD_VERSION=8.2 \
+    LMOD_CMD=/opt/apps/lmod/lmod/libexec/lmod \
+    LMOD_DIR=/opt/apps/lmod/lmod/libexec \
+    BASH_ENV=/opt/apps/lmod/lmod/init/bash
+
+COPY . /reinventcli/
+
+WORKDIR /reinventcli
+
+# Update conda itself, install the dependencies listed in reinvent.yml into the base
+# environment and make the copied sources readable/executable for all users.
+RUN conda update -n base -c defaults conda && \
+    conda env update --name=base --file=reinvent.yml && \
+    chmod -R "a+rx" /reinventcli
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-REINVENT 3.1
+REINVENT 3.2
 =================================================================================================================
 
 Installation
@@ -12,7 +12,7 @@ Installation
 
 4. Activate the environment:
 
-        $ conda activate reinvent.v3.0
+        $ conda activate reinvent.v3.2
 
 5. Use the tool.
 

diff --git a/configs/example.config.json b/configs/example.config.json
@@ -18,6 +18,13 @@
       "AZGARD_EXECUTOR_SCRIPT_PATH": "/<your_path>/executor.py",
       "AZGARD_ENV_PATH": "/<your_path>/miniconda3/envs/AZgard/bin/python",
       "AZGARD_DEBUG": true
+    },
+    "ICOLOS": {
+      "ICOLOS_EXECUTOR_PATH": "/<your_path>/miniconda3/envs/icolosprod/bin/icolos",
+      "ICOLOS_DEBUG": true
+    },
+    "AIZYNTH": {
+      "CONFIG": "/projects/mai/synthesisplanning/minimal_config.yml"
     }
   },
   "ENVIRONMENTAL_VARIABLES": {
@@ -28,5 +35,7 @@
   "ACTIVITY_CLASSIFICATION": "",
   "SMILES_SET_PATH": "",
   "PRIOR_PATH": "",
-  "LIBINVENT_PRIOR_PATH": ""
+  "LIBINVENT_PRIOR_PATH": "",
+  "SMILES_SET_LINK_INVENT_PATH":"",
+  "LINK_INVENT_PRIOR_PATH": ""
 }
diff --git a/input.py b/input.py
@@ -3,19 +3,38 @@
 
 import sys
 import json
+import argparse
+from pathlib import Path
 from running_modes.manager import Manager
 
 
-if __name__ == "__main__":
+DEFAULT_BASE_CONFIG_PATH = (Path(__file__).parent / 'configs/config.json').resolve()
+
+parser = argparse.ArgumentParser(description='Run Reinvent.')
+parser.add_argument(
+    '--base_config', type=str, default=DEFAULT_BASE_CONFIG_PATH,
+    help='Path to basic configuration for Reinvent environment.'
+)
+parser.add_argument(
+    'run_config', type=str,
+    help='Path to configuration json file for this run.'
+)
 
-    with open(sys.argv[1]) as f:
-        json_input = f.read().replace('\r', '').replace('\n', '')
 
-    configuration = {}
+def read_json_file(path):
+    with open(path) as f:
+        json_input = f.read().replace('\r', '').replace('\n', '')
     try:
-        configuration = json.loads(json_input)
-    except (ValueError, KeyError, TypeError):
-        print("JSON format error")
-    else:
-        manager = Manager(configuration)
-        manager.run()
+        return json.loads(json_input)
+    except (ValueError, KeyError, TypeError) as e:
+        print(f"JSON format error in file ${path}: \n ${e}")
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    base_config = read_json_file(args.base_config)
+    run_config = read_json_file(args.run_config)
+
+    manager = Manager(base_config, run_config)
+    manager.run()
diff --git a/pytest.ini b/pytest.ini
diff --git a/reinvent.yml b/reinvent.yml
@@ -1,4 +1,4 @@
-name: reinvent.v3.0
+name: reinvent.v3.2
 channels:
   - rdkit
   - pytorch
@@ -211,9 +211,9 @@ dependencies:
     - markdown==3.2.1
     - opt-einsum==3.2.0
     - protobuf==3.11.3
-    - reinvent-chemistry==0.0.40
-    - reinvent-models==0.0.12
-    - reinvent-scoring==0.0.57
+    - reinvent-chemistry==0.0.50
+    - reinvent-models==0.0.15rc1
+    - reinvent-scoring==0.0.73
     - tensorboard==1.15.0
     - tensorflow==1.15.2
     - tensorflow-estimator==1.15.1

diff --git a/running_modes/.directory b/running_modes/.directory
@@ -0,0 +1,3 @@
+[Dolphin]
+Timestamp=2022,4,8,15,57,33
+Version=3
diff --git a/running_modes/automated_curriculum_learning/actions/__init__.py b/running_modes/automated_curriculum_learning/actions/__init__.py
@@ -0,0 +1,4 @@
+from running_modes.automated_curriculum_learning.actions.base_action import BaseAction
+from running_modes.automated_curriculum_learning.actions.base_sample_action import BaseSampleAction
+from running_modes.automated_curriculum_learning.actions.lib_invent_sample_model import LibInventSampleModel
+from running_modes.automated_curriculum_learning.actions.link_invent_sample_model import LinkInventSampleModel
diff --git a/running_modes/automated_curriculum_learning/actions/base_action.py b/running_modes/automated_curriculum_learning/actions/base_action.py
@@ -0,0 +1,11 @@
+import abc
+from running_modes.automated_curriculum_learning.logging.base_logger import BaseLogger
+
+
+class BaseAction(abc.ABC):
+    """Abstract parent class of the actions in this package; it only stores the logger."""
+
+    def __init__(self, logger=None):
+        """
+        (Abstract) Initializes an action.
+        :param logger: An optional logger instance; stored as-is (None when omitted).
+        """
+        self.logger: BaseLogger = logger
diff --git a/running_modes/automated_curriculum_learning/actions/base_sample_action.py b/running_modes/automated_curriculum_learning/actions/base_sample_action.py
@@ -0,0 +1,11 @@
+import numpy as np
+from running_modes.automated_curriculum_learning.actions import BaseAction
+
+
+class BaseSampleAction(BaseAction):
+
+    def _get_indices_of_unique_smiles(self, smiles: [str]) -> np.array:
+        """Returns an np.array of indices corresponding to the first entries in a list of smiles strings"""
+        _, idxs = np.unique(smiles, return_index=True)
+        sorted_indices = np.sort(idxs)
+        return sorted_indices
diff --git a/running_modes/automated_curriculum_learning/actions/lib_invent_sample_model.py b/running_modes/automated_curriculum_learning/actions/lib_invent_sample_model.py
@@ -0,0 +1,68 @@
+from typing import List
+
+import numpy as np
+from reinvent_chemistry import Conversions
+from reinvent_chemistry.library_design import BondMaker, AttachmentPoints
+from reinvent_models.lib_invent.models.dataset import Dataset
+from reinvent_models.model_factory.generative_model_base import GenerativeModelBase
+from torch.utils.data import DataLoader
+
+from running_modes.automated_curriculum_learning.actions import BaseSampleAction
+from running_modes.automated_curriculum_learning.dto.sampled_sequences_dto import SampledSequencesDTO
+
+
+class LibInventSampleModel(BaseSampleAction):
+
+    def __init__(self, model: GenerativeModelBase, batch_size: int, logger=None, randomize=False, sample_uniquely=True):
+        """
+        Creates an instance of SampleModel.
+        :params model: A model instance (better in scaffold_decorating mode).
+        :params batch_size: Batch size to use.
+        :return:
+        """
+        super().__init__(logger)
+        self.model = model
+        self._batch_size = batch_size
+        self._bond_maker = BondMaker()
+        self._attachment_points = AttachmentPoints()
+        self._randomize = randomize
+        self._conversions = Conversions()
+        self._sample_uniquely = sample_uniquely
+
+    def run(self, scaffold_list: List[str]) -> List[SampledSequencesDTO]:
+        """
+        Samples the model for the given number of SMILES.
+        :params scaffold_list: A list of scaffold SMILES.
+        :return: A list of SampledSequencesDTO.
+        """
+        scaffold_list = self._randomize_scaffolds(scaffold_list) if self._randomize else scaffold_list
+        clean_scaffolds = [self._attachment_points.remove_attachment_point_numbers(scaffold) for scaffold in scaffold_list]
+        dataset = Dataset(clean_scaffolds, self.model.get_vocabulary().scaffold_vocabulary,
+                             self.model.get_vocabulary().scaffold_tokenizer)
+        dataloader = DataLoader(dataset, batch_size=len(dataset), shuffle=False, collate_fn=Dataset.collate_fn)
+
+        for batch in dataloader:
+            sampled_sequences = []
+
+            for _ in range(self._batch_size):
+                scaffold_seqs, scaffold_seq_lengths = batch
+                packed = self.model.sample(scaffold_seqs, scaffold_seq_lengths)
+                for scaffold, decoration, nll in packed:
+                    sampled_sequences.append(SampledSequencesDTO(scaffold, decoration, nll))
+
+            if self._sample_uniquely:
+                sampled_sequences = self._sample_unique_sequences(sampled_sequences)
+
+            return sampled_sequences
+
+    def _sample_unique_sequences(self, sampled_sequences: List[SampledSequencesDTO]) -> List[SampledSequencesDTO]:
+        strings = ["".join([ss.input, ss.output]) for index, ss in enumerate(sampled_sequences)]
+        unique_idxs = self._get_indices_of_unique_smiles(strings)
+        sampled_sequences_np = np.array(sampled_sequences)
+        unique_sampled_sequences = sampled_sequences_np[unique_idxs]
+        return unique_sampled_sequences.tolist()
+
+    def _randomize_scaffolds(self, scaffolds: List[str]):
+        scaffold_mols = [self._conversions.smile_to_mol(scaffold) for scaffold in scaffolds]
+        randomized = [self._bond_maker.randomize_scaffold(mol) for mol in scaffold_mols]
+        return randomized
diff --git a/running_modes/automated_curriculum_learning/actions/link_invent_sample_model.py b/running_modes/automated_curriculum_learning/actions/link_invent_sample_model.py
@@ -0,0 +1,72 @@
+from typing import List
+
+import numpy as np
+from reinvent_chemistry import Conversions, TransformationTokens
+from reinvent_chemistry.library_design import BondMaker, AttachmentPoints
+from reinvent_models.link_invent.dataset.dataset import Dataset
+from reinvent_models.model_factory.generative_model_base import GenerativeModelBase
+from torch.utils.data import DataLoader
+
+from running_modes.automated_curriculum_learning.actions import BaseSampleAction
+from running_modes.automated_curriculum_learning.dto.sampled_sequences_dto import SampledSequencesDTO
+
+
+class LinkInventSampleModel(BaseSampleAction):
+    def __init__(self, model: GenerativeModelBase, batch_size: int, logger=None, randomize=False, sample_uniquely=True):
+        """
+        Creates an instance of SampleModel.
+        :params model: A model instance.
+        :params batch_size: Batch size to use.
+        :return:
+        """
+        super().__init__(logger)
+        self.model = model
+        self._batch_size = batch_size
+        self._bond_maker = BondMaker()
+        self._randomize = randomize
+        self._sample_uniquely = sample_uniquely
+
+        self._conversions = Conversions()
+        self._attachment_points = AttachmentPoints()
+        self._tokens = TransformationTokens()
+
+    def run(self, warheads_list: List[str]) -> List[SampledSequencesDTO]:
+        """
+        Samples the model for the given number of SMILES.
+        :params warheads_list: A list of warhead pair SMILES.
+        :return: A list of SampledSequencesDTO.
+        """
+        warheads_list = self._randomize_warheads(warheads_list) if self._randomize else warheads_list
+        clean_warheads = [self._attachment_points.remove_attachment_point_numbers(warheads) for warheads in warheads_list]
+        dataset = Dataset(clean_warheads, self.model.get_vocabulary().input)
+        data_loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False, collate_fn=dataset.collate_fn)
+
+        for batch in data_loader:
+            sampled_sequences = []
+            for _ in range(self._batch_size):
+                sampled_sequences.extend(self.model.sample(*batch))
+
+            if self._sample_uniquely:
+                sampled_sequences = self._sample_unique_sequences(sampled_sequences)
+
+            return sampled_sequences
+
+    def _sample_unique_sequences(self, sampled_sequences: List[SampledSequencesDTO]) -> List[SampledSequencesDTO]:
+        # TODO could be part of a base sample action as it is the same for link and lib invent
+        strings = ["".join([ss.input, ss.output]) for index, ss in enumerate(sampled_sequences)]
+        unique_idxs = self._get_indices_of_unique_smiles(strings)
+        sampled_sequences_np = np.array(sampled_sequences)
+        unique_sampled_sequences = sampled_sequences_np[unique_idxs]
+        return unique_sampled_sequences.tolist()
+
+    def _randomize_warheads(self, warhead_pair_list: List[str]):
+        randomized_warhead_pair_list = []
+        for warhead_pair in warhead_pair_list:
+            warhead_list = warhead_pair.split(self._tokens.ATTACHMENT_SEPARATOR_TOKEN)
+            warhead_mol_list = [self._conversions.smile_to_mol(warhead) for warhead in warhead_list]
+            warhead_randomized_list = [self._conversions.mol_to_random_smiles(mol) for mol in warhead_mol_list]
+            # Note do not use self.self._bond_maker.randomize_scaffold, as it would add unwanted brackets to the
+            # attachment points (which are not part of the warhead vocabulary)
+            warhead_pair_randomized = self._tokens.ATTACHMENT_SEPARATOR_TOKEN.join(warhead_randomized_list)
+            randomized_warhead_pair_list.append(warhead_pair_randomized)
+        return randomized_warhead_pair_list
diff --git a/running_modes/automated_curriculum_learning/actions/reinvent_sample_model.py b/running_modes/automated_curriculum_learning/actions/reinvent_sample_model.py
@@ -0,0 +1,38 @@
+from typing import Tuple, Any
+
+import numpy as np
+from reinvent_chemistry import Conversions
+from reinvent_models.model_factory.generative_model_base import GenerativeModelBase
+
+from running_modes.automated_curriculum_learning.actions import BaseSampleAction
+from running_modes.automated_curriculum_learning.dto import SampledBatchDTO
+
+
+class ReinventSampleModel(BaseSampleAction):
+    def __init__(self, model: GenerativeModelBase, batch_size: int, logger=None):
+        """
+        Creates an instance of SampleModel.
+        :params model: A model instance.
+        :params batch_size: Batch size to use.
+        :return:
+        """
+        super().__init__(logger)
+        self.model = model
+        self._batch_size = batch_size
+
+        self._conversions = Conversions()
+
+    def run(self) -> SampledBatchDTO:
+        seqs, smiles, agent_likelihood = self._sample_unique_sequences(self.model, self._batch_size)
+        batch = SampledBatchDTO(seqs, smiles, agent_likelihood)
+
+        return batch
+
+    def _sample_unique_sequences(self, agent: GenerativeModelBase, batch_size: int) -> Tuple[Any, Any, Any]:
+        seqs, smiles, agent_likelihood = agent.sample(batch_size)
+        unique_idxs = self._get_indices_of_unique_smiles(smiles)
+        seqs_unique = seqs[unique_idxs]
+        smiles_np = np.array(smiles)
+        smiles_unique = smiles_np[unique_idxs]
+        agent_likelihood_unique = agent_likelihood[unique_idxs]
+        return seqs_unique, smiles_unique, agent_likelihood_unique