From 84d05e73d7f33ab13ecbe1a2e80308238c1e1308 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 6 Feb 2024 13:11:52 +0100 Subject: [PATCH 01/61] move pyg optional deps to conda --- .github/actions/install-python-and-package/action.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 6815a04b6..bb1c1b198 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -57,7 +57,10 @@ runs: ### Installing for CPU only on the CI conda install pytorch=2.1.1 torchvision=0.16.1 torchaudio=2.1.1 cpuonly=2.0.* -c pytorch conda install pyg=2.4.0 -c pyg - pip install torch_scatter==2.1.2 torch_sparse==0.6.18 torch_cluster==1.6.3 torch_spline_conv==1.2.2 -f https://data.pyg.org/whl/torch-2.1.0+cpu.html + conda install pytorch-scatter=2.1.2 -c pyg + conda install pytorch-sparse=0.6.18 -c pyg + conda install pytorch-cluster=1.6.3 -c pyg + conda install pytorch-spline-conv=1.2.2 -c pyg - name: Install dependencies on MacOS shell: bash {0} env: From fd5e74309c96d6bdf62a9c7ea975bd4d284ab738 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 6 Feb 2024 13:27:24 +0100 Subject: [PATCH 02/61] add conda installation of pip deps --- .../install-python-and-package/action.yml | 7 ++----- pyproject.toml | 18 +++++++++--------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index bb1c1b198..05e99d307 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -56,11 +56,8 @@ runs: ## PyTorch, PyG, PyG adds ### Installing for CPU only on the CI conda install pytorch=2.1.1 torchvision=0.16.1 torchaudio=2.1.1 cpuonly=2.0.* -c pytorch - conda install pyg=2.4.0 -c pyg - conda install pytorch-scatter=2.1.2 -c pyg - conda install pytorch-sparse=0.6.18 -c pyg - conda install pytorch-cluster=1.6.3 -c pyg - conda install pytorch-spline-conv=1.2.2 -c pyg + conda install pyg=2.4.0 pytorch-scatter=2.1.2 pytorch-sparse=0.6.18 pytorch-cluster=1.6.3 pytorch-spline-conv=1.2.2 -c pyg + conda install 'tables>=3.8.0' 'numpy>=1.21.5' 'scipy>=1.11.2' 'h5py>=3.6.0' 'networkx>=2.6.3' 'matplotlib>=3.5.1' 'pdb2sql>=0.5.1' 'scikit-learn>=1.0.2' 'chart-studio>=1.1.0' -c conda-forge - name: Install dependencies on MacOS shell: bash {0} env: diff --git a/pyproject.toml b/pyproject.toml index 6482eb385..2d4cd0c94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,15 +35,15 @@ classifiers = [ "Programming Language :: Python :: 3.10", ] dependencies = [ - "tables >= 3.8.0", - "numpy >= 1.21.5", - "scipy >= 1.11.2", - "h5py >= 3.6.0", - "networkx >= 2.6.3", - "matplotlib >= 3.5.1", - "pdb2sql >= 0.5.1", - "scikit-learn >= 1.0.2", - "chart-studio >= 1.1.0", + # "tables >= 3.8.0", + # "numpy >= 1.21.5", + # "scipy >= 1.11.2", + # "h5py >= 3.6.0", + # "networkx >= 2.6.3", + # "matplotlib >= 3.5.1", + # "pdb2sql >= 0.5.1", + # "scikit-learn >= 1.0.2", + # "chart-studio >= 1.1.0", "biopython >= 1.81", "python-louvain >= 0.16", "markov-clustering >= 0.0.6.dev0", From 181d6e2cf067066a5cf1a3db752e1a01bcd2162d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 6 Feb 2024 13:32:13 +0100 Subject: [PATCH 03/61] remove pdb2sql from conda installations --- .github/actions/install-python-and-package/action.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 05e99d307..c25fef989 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -57,7 +57,7 @@ runs: ### Installing for CPU only on the CI conda install pytorch=2.1.1 torchvision=0.16.1 torchaudio=2.1.1 cpuonly=2.0.* -c pytorch conda install pyg=2.4.0 pytorch-scatter=2.1.2 pytorch-sparse=0.6.18 pytorch-cluster=1.6.3 pytorch-spline-conv=1.2.2 -c pyg - conda install 'tables>=3.8.0' 'numpy>=1.21.5' 'scipy>=1.11.2' 'h5py>=3.6.0' 'networkx>=2.6.3' 'matplotlib>=3.5.1' 'pdb2sql>=0.5.1' 'scikit-learn>=1.0.2' 'chart-studio>=1.1.0' -c conda-forge + conda install 'tables>=3.8.0' 'numpy>=1.21.5' 'scipy>=1.11.2' 'h5py>=3.6.0' 'networkx>=2.6.3' 'matplotlib>=3.5.1' 'scikit-learn>=1.0.2' 'chart-studio>=1.1.0' -c conda-forge - name: Install dependencies on MacOS shell: bash {0} env: diff --git a/pyproject.toml b/pyproject.toml index 2d4cd0c94..25c435996 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ dependencies = [ # "h5py >= 3.6.0", # "networkx >= 2.6.3", # "matplotlib >= 3.5.1", - # "pdb2sql >= 0.5.1", + "pdb2sql >= 0.5.1", # "scikit-learn >= 1.0.2", # "chart-studio >= 1.1.0", "biopython >= 1.81", From cf19db61d09908d4a4d0a8308c5e4bdb5a8b3f46 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 6 Feb 2024 13:55:45 +0100 Subject: [PATCH 04/61] finish adding conda installation for the required deps --- .../install-python-and-package/action.yml | 13 +++++++------ pyproject.toml | 18 +++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index c25fef989..069386546 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -49,15 +49,16 @@ runs: if: runner.os == 'Linux' run: | # Install deeprank2 conda dependencies - ## DSSP - conda install -c sbl dssp>=4.2.2.1 - ## MSMS - conda install -c bioconda msms>=2.6.1 - ## PyTorch, PyG, PyG adds + ## sbl channel + conda install 'dssp>=4.2.2.1' -c sbl + ## bioconda channel + conda install 'msms>=2.6.1' 'markov_clustering>=0.0.6' -c bioconda + ## pytorch and pyg channels ### Installing for CPU only on the CI conda install pytorch=2.1.1 torchvision=0.16.1 torchaudio=2.1.1 cpuonly=2.0.* -c pytorch conda install pyg=2.4.0 pytorch-scatter=2.1.2 pytorch-sparse=0.6.18 pytorch-cluster=1.6.3 pytorch-spline-conv=1.2.2 -c pyg - conda install 'tables>=3.8.0' 'numpy>=1.21.5' 'scipy>=1.11.2' 'h5py>=3.6.0' 'networkx>=2.6.3' 'matplotlib>=3.5.1' 'scikit-learn>=1.0.2' 'chart-studio>=1.1.0' -c conda-forge + ## conda-forge channel + conda install 'tables>=3.8.0' 'numpy>=1.21.5' 'scipy>=1.11.2' 'h5py>=3.6.0' 'networkx>=2.6.3' 'matplotlib>=3.5.1' 'scikit-learn>=1.0.2' 'chart-studio>=1.1.0' 'biopython>=1.81' 'python-louvain>=0.16' 'tqdm>=4.63.0' 'freesasa>=2.1.0' 'tensorboard>=0.9.0' 'protobuf>=3.20.1' 'ruff>=0.1.13' 'dill>=0.3.8' -c conda-forge - name: Install dependencies on MacOS shell: bash {0} env: diff --git a/pyproject.toml b/pyproject.toml index 25c435996..e853fc211 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,15 +44,15 @@ dependencies = [ "pdb2sql >= 0.5.1", # "scikit-learn >= 1.0.2", # "chart-studio >= 1.1.0", - "biopython >= 1.81", - "python-louvain >= 0.16", - "markov-clustering >= 0.0.6.dev0", - "tqdm >= 4.63.0", - 
"freesasa >= 2.1.0", - "tensorboard >= 0.9.0", - "protobuf >= 3.20.1", - "ruff >= 0.1.13", - "dill", + # "biopython >= 1.81", + # "python-louvain >= 0.16", + # "markov-clustering >= 0.0.6.dev0", + # "tqdm >= 4.63.0", + # "freesasa >= 2.1.0", + # "tensorboard >= 0.9.0", + # "protobuf >= 3.20.1", + # "ruff >= 0.1.13", + # "dill", ] [project.optional-dependencies] From 6d367d7bfd57ff86e2d825b10d47ed9c28a5ab17 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 10:05:27 +0100 Subject: [PATCH 05/61] uniform specifications in conda deps --- .github/actions/install-python-and-package/action.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 069386546..145bde74c 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -50,15 +50,15 @@ runs: run: | # Install deeprank2 conda dependencies ## sbl channel - conda install 'dssp>=4.2.2.1' -c sbl + conda install dssp>=4.2.2.1 -c sbl ## bioconda channel - conda install 'msms>=2.6.1' 'markov_clustering>=0.0.6' -c bioconda + conda install msms>=2.6.1 markov_clustering>=0.0.6 -c bioconda ## pytorch and pyg channels ### Installing for CPU only on the CI conda install pytorch=2.1.1 torchvision=0.16.1 torchaudio=2.1.1 cpuonly=2.0.* -c pytorch conda install pyg=2.4.0 pytorch-scatter=2.1.2 pytorch-sparse=0.6.18 pytorch-cluster=1.6.3 pytorch-spline-conv=1.2.2 -c pyg ## conda-forge channel - conda install 'tables>=3.8.0' 'numpy>=1.21.5' 'scipy>=1.11.2' 'h5py>=3.6.0' 'networkx>=2.6.3' 'matplotlib>=3.5.1' 'scikit-learn>=1.0.2' 'chart-studio>=1.1.0' 'biopython>=1.81' 'python-louvain>=0.16' 'tqdm>=4.63.0' 'freesasa>=2.1.0' 'tensorboard>=0.9.0' 'protobuf>=3.20.1' 'ruff>=0.1.13' 'dill>=0.3.8' -c conda-forge + conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge - name: Install dependencies on MacOS shell: bash {0} env: From ca672f12bc38e6dd2510c26719bf16ea4d1ad508 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 10:19:09 +0100 Subject: [PATCH 06/61] insert >= for pytorch and its deps --- .github/actions/install-python-and-package/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 145bde74c..c49da84b9 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -55,7 +55,7 @@ runs: conda install msms>=2.6.1 markov_clustering>=0.0.6 -c bioconda ## pytorch and pyg channels ### Installing for CPU only on the CI - conda install pytorch=2.1.1 torchvision=0.16.1 torchaudio=2.1.1 cpuonly=2.0.* -c pytorch + conda install pytorch>=2.1.1 torchvision>=0.16.1 torchaudio>=2.1.1 cpuonly>=2.0 -c pytorch conda install pyg=2.4.0 pytorch-scatter=2.1.2 pytorch-sparse=0.6.18 pytorch-cluster=1.6.3 pytorch-spline-conv=1.2.2 -c pyg ## conda-forge channel conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c 
conda-forge From 484496fc39aa63171a461f0be1ac46a37f940003 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 10:25:28 +0100 Subject: [PATCH 07/61] fix pytorch version to 2.1.1 --- .github/actions/install-python-and-package/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index c49da84b9..de4c47fca 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -55,7 +55,7 @@ runs: conda install msms>=2.6.1 markov_clustering>=0.0.6 -c bioconda ## pytorch and pyg channels ### Installing for CPU only on the CI - conda install pytorch>=2.1.1 torchvision>=0.16.1 torchaudio>=2.1.1 cpuonly>=2.0 -c pytorch + conda install pytorch=2.1.1 torchvision>=0.16.1 torchaudio>=2.1.1 cpuonly>=2.0 -c pytorch conda install pyg=2.4.0 pytorch-scatter=2.1.2 pytorch-sparse=0.6.18 pytorch-cluster=1.6.3 pytorch-spline-conv=1.2.2 -c pyg ## conda-forge channel conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge From a02b4f86a60f6d83c3d2f8ebab4862fa715678e1 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 10:32:28 +0100 Subject: [PATCH 08/61] insert >= for pyg and its deps --- .github/actions/install-python-and-package/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index de4c47fca..6d2fd6ce0 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -56,7 +56,7 @@ runs: ## pytorch and pyg channels ### Installing for CPU only on the CI conda install pytorch=2.1.1 torchvision>=0.16.1 torchaudio>=2.1.1 cpuonly>=2.0 -c pytorch - conda install pyg=2.4.0 pytorch-scatter=2.1.2 pytorch-sparse=0.6.18 pytorch-cluster=1.6.3 pytorch-spline-conv=1.2.2 -c pyg + conda install pyg>=2.4.0 pytorch-scatter>=2.1.2 pytorch-sparse>=0.6.18 pytorch-cluster>=1.6.3 pytorch-spline-conv>=1.2.2 -c pyg ## conda-forge channel conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge - name: Install dependencies on MacOS From b2e4c279970bf0696cfca5793f1fcb80edd5af64 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 13:29:00 +0100 Subject: [PATCH 09/61] try to do conda installations with yml file --- .../install-python-and-package/action.yml | 21 +++++----- env/env2.yml | 38 +++++++++++++++++++ env/requirements2.txt | 1 + pyproject.toml | 2 +- 4 files changed, 51 insertions(+), 11 deletions(-) create mode 100644 env/env2.yml create mode 100644 env/requirements2.txt diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 6d2fd6ce0..7224e9add 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -49,16 +49,17 @@ runs: if: runner.os == 'Linux' run: | # Install deeprank2 conda dependencies - ## sbl channel - conda 
install dssp>=4.2.2.1 -c sbl - ## bioconda channel - conda install msms>=2.6.1 markov_clustering>=0.0.6 -c bioconda - ## pytorch and pyg channels - ### Installing for CPU only on the CI - conda install pytorch=2.1.1 torchvision>=0.16.1 torchaudio>=2.1.1 cpuonly>=2.0 -c pytorch - conda install pyg>=2.4.0 pytorch-scatter>=2.1.2 pytorch-sparse>=0.6.18 pytorch-cluster>=1.6.3 pytorch-spline-conv>=1.2.2 -c pyg - ## conda-forge channel - conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge + conda env update -f env/env2.yml + # ## sbl channel + # conda install dssp>=4.2.2.1 -c sbl + # ## bioconda channel + # conda install msms>=2.6.1 markov_clustering>=0.0.6 -c bioconda + # ## pytorch and pyg channels + # ### Installing for CPU only on the CI + # conda install pytorch=2.1.1 torchvision>=0.16.1 torchaudio>=2.1.1 cpuonly>=2.0 -c pytorch + # conda install pyg>=2.4.0 pytorch-scatter>=2.1.2 pytorch-sparse>=0.6.18 pytorch-cluster>=1.6.3 pytorch-spline-conv>=1.2.2 -c pyg + # ## conda-forge channel + # conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge - name: Install dependencies on MacOS shell: bash {0} env: diff --git a/env/env2.yml b/env/env2.yml new file mode 100644 index 000000000..0159bd684 --- /dev/null +++ b/env/env2.yml @@ -0,0 +1,38 @@ +channels: + - pytorch + - pyg + - bioconda + - defaults + - conda-forge + - sbl +dependencies: + - dssp>=4.2.2.1 + - msms>=2.6.1 + - markov_clustering>=0.0.6 + - pytorch=2.1.1 + - torchvision>=0.16.1 + - torchaudio>=2.1.1 + - cpuonly>=2.0 + - pyg>=2.4.0 + - pytorch-scatter>=2.1.2 + - pytorch-sparse>=0.6.18 + - pytorch-cluster>=1.6.3 + - pytorch-spline-conv>=1.2.2 + - tables>=3.8.0 + - numpy>=1.21.5 + - scipy>=1.11.2 + - h5py>=3.6.0 + - networkx>=2.6.3 + - matplotlib>=3.5.1 + - scikit-learn>=1.0.2 + - chart-studio>=1.1.0 + - biopython>=1.81 + - python-louvain>=0.16 + - tqdm>=4.63.0 + - freesasa>=2.1.0 + - tensorboard>=0.9.0 + - protobuf>=3.20.1 + - ruff>=0.1.13 + - dill>=0.3.8 + - pip: + - --requirement requirements2.txt diff --git a/env/requirements2.txt b/env/requirements2.txt new file mode 100644 index 000000000..e68fd71d3 --- /dev/null +++ b/env/requirements2.txt @@ -0,0 +1 @@ +pdb2sql>=0.5.1 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index e853fc211..e65c3981d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ dependencies = [ # "h5py >= 3.6.0", # "networkx >= 2.6.3", # "matplotlib >= 3.5.1", - "pdb2sql >= 0.5.1", + # "pdb2sql >= 0.5.1", # "scikit-learn >= 1.0.2", # "chart-studio >= 1.1.0", # "biopython >= 1.81", From e2c3f727845f178122f6cfa3c1d96dea4e85a8d6 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 14:16:37 +0100 Subject: [PATCH 10/61] change action to miniconda --- .../install-python-and-package/action.yml | 54 +++++++------------ .github/workflows/build-repo.yml | 8 +++ env/env2.yml | 1 + 3 files changed, 27 insertions(+), 36 deletions(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 7224e9add..8bf4be30a 100644 --- 
a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -26,30 +26,36 @@ runs: uses: styfle/cancel-workflow-action@0.4.0 with: access_token: ${{ github.token }} + - uses: actions/checkout@v3 - - name: Setup conda - uses: s-weigand/setup-conda@v1 + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 with: - update-conda: true + auto-update-conda: true + miniforge-variant: Mambaforge + channels: conda-forge python-version: ${{ inputs.python-version }} - conda-channels: pytorch, pyg, bioconda, defaults, sbl, conda-forge + activate-environment: deeprank2 + environment-file: env/env2.yml + use-mamba: true + - run: | conda --version conda env list - shell: bash {0} + - name: Python info - shell: bash -e {0} run: | which python3 python3 --version - - name: Install dependencies on Linux - shell: bash {0} + + - name: Activate deeprank2 environment on Linux env: CMAKE_INSTALL_PREFIX: .local if: runner.os == 'Linux' run: | - # Install deeprank2 conda dependencies - conda env update -f env/env2.yml + conda activate deeprank2 + # # Install deeprank2 conda dependencies # ## sbl channel # conda install dssp>=4.2.2.1 -c sbl # ## bioconda channel @@ -60,36 +66,12 @@ runs: # conda install pyg>=2.4.0 pytorch-scatter>=2.1.2 pytorch-sparse>=0.6.18 pytorch-cluster>=1.6.3 pytorch-spline-conv>=1.2.2 -c pyg # ## conda-forge channel # conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge - - name: Install dependencies on MacOS - shell: bash {0} - env: - CMAKE_INSTALL_PREFIX: .local - if: runner.os == 'macOS' - run: | - # Install dependencies not handled by setuptools - ## DSSP - conda install -c sbl dssp>=4.2.2.1 - ## MSMS - cd /tmp/ - wget http://mgltools.scripps.edu/downloads/tars/releases/MSMSRELEASE/REL2.6.1/msms_i86Linux2_2.6.1.tar.gz - sudo mkdir /usr/local/lib/msms - cd /usr/local/lib/msms - sudo tar zxvf /tmp/msms_i86Linux2_2.6.1.tar.gz - sudo ln -s /usr/local/lib/msms/msms.i86Linux2.2.6.1 /usr/local/bin/msms - sudo ln -s /usr/local/lib/msms/pdb_to_xyzr* /usr/local/bin - ## PyTorch, PyG, PyG adds - ### Installing for CPU only on the CI - conda install pytorch torchvision torchaudio cpuonly -c pytorch - pip install torch_geometric - pip install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-$(python3 -c "import torch; print(torch.__version__)")+cpu.html - # PyTables via conda only for MacOS - conda install pytables + - name: Install the GitHub repository version of the package - shell: bash {0} if: ${{ inputs.pkg-installation-type == 'repository' }} run: pip install .'[${{ inputs.extras-require }}]' + - name: Install the latest released version of the package - shell: bash {0} if: ${{ inputs.pkg-installation-type == 'latest' }} run: | pip install pytest diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 13e2764a2..7af4cc7ae 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -39,14 +39,22 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.10"] # ["3.10", "3.11"] + # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell + defaults: + run: + shell: bash -l {0} + steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/install-python-and-package with: 
python-version: ${{ matrix.python-version }} extras-require: test, publishing pkg-installation-type: "repository" + - name: Run unit tests run: pytest -v + - name: Verify that we can build the package run: python3 -m build diff --git a/env/env2.yml b/env/env2.yml index 0159bd684..23dcc17a2 100644 --- a/env/env2.yml +++ b/env/env2.yml @@ -1,3 +1,4 @@ +name: deeprank2 channels: - pytorch - pyg From 489d9bc3521b3261518791dbd546b15cc1bb23e1 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 14:18:10 +0100 Subject: [PATCH 11/61] add bash property --- .github/actions/install-python-and-package/action.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 8bf4be30a..10dfbf616 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -43,13 +43,16 @@ runs: - run: | conda --version conda env list + shell: bash {0} - name: Python info + shell: bash -e {0} run: | which python3 python3 --version - name: Activate deeprank2 environment on Linux + shell: bash {0} env: CMAKE_INSTALL_PREFIX: .local if: runner.os == 'Linux' @@ -68,10 +71,11 @@ runs: # conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge - name: Install the GitHub repository version of the package + shell: bash {0} if: ${{ inputs.pkg-installation-type == 'repository' }} run: pip install .'[${{ inputs.extras-require }}]' - - name: Install the latest released version of the package + shell: bash {0} if: ${{ inputs.pkg-installation-type == 'latest' }} run: | pip install pytest From 7d4fd5ded96039c3bfe8dff777d8510105f13e57 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 14:24:24 +0100 Subject: [PATCH 12/61] add -el --- .github/actions/install-python-and-package/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 10dfbf616..421a0ea6d 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -52,7 +52,7 @@ runs: python3 --version - name: Activate deeprank2 environment on Linux - shell: bash {0} + shell: bash -el {0} env: CMAKE_INSTALL_PREFIX: .local if: runner.os == 'Linux' From edcf798d1d5adede4954b149cbd6b9d10ccf1071 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 16:56:33 +0100 Subject: [PATCH 13/61] try to fix env activation --- .github/actions/install-python-and-package/action.yml | 4 +++- .github/workflows/build-repo.yml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 421a0ea6d..4942488c1 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -73,7 +73,9 @@ runs: - name: Install the GitHub repository version of the package shell: bash {0} if: ${{ inputs.pkg-installation-type == 'repository' }} - run: pip install .'[${{ inputs.extras-require }}]' + run: | + conda activate deeprank2 + pip install .'[${{ inputs.extras-require }}]' - name: Install the latest 
released version of the package shell: bash {0} if: ${{ inputs.pkg-installation-type == 'latest' }} diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 7af4cc7ae..b3b1cf3ea 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -54,7 +54,9 @@ jobs: pkg-installation-type: "repository" - name: Run unit tests - run: pytest -v + run: | + conda activate deeprank2 + pytest -v - name: Verify that we can build the package run: python3 -m build From 03461dc008b6dc7905f6ad69f5cd608afcd1e246 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 17:01:19 +0100 Subject: [PATCH 14/61] try to fix again conda env activation --- .github/actions/install-python-and-package/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 4942488c1..74f47548c 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -71,7 +71,7 @@ runs: # conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge - name: Install the GitHub repository version of the package - shell: bash {0} + shell: bash -el {0} if: ${{ inputs.pkg-installation-type == 'repository' }} run: | conda activate deeprank2 From fa4d3f8694bed6266dce1764c2708304d069f560 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Feb 2024 17:09:14 +0100 Subject: [PATCH 15/61] specify sbl channel for dssp for fixing the error --- env/env2.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/env/env2.yml b/env/env2.yml index 23dcc17a2..2f777cea6 100644 --- a/env/env2.yml +++ b/env/env2.yml @@ -7,7 +7,8 @@ channels: - conda-forge - sbl dependencies: - - dssp>=4.2.2.1 + - sbl::libcifpp>=5.1.0 + - sbl::dssp>=4.2.2.1 - msms>=2.6.1 - markov_clustering>=0.0.6 - pytorch=2.1.1 From 8f66c6df31b5a4a4233b70a44b5309dd60ace834 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 13 Feb 2024 16:50:19 +0100 Subject: [PATCH 16/61] remove deps from pyproject --- .../install-python-and-package/action.yml | 31 +++++-------------- .github/workflows/build-repo.yml | 4 +-- pyproject.toml | 20 ------------ 3 files changed, 8 insertions(+), 47 deletions(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 74f47548c..c0f807fdf 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -43,43 +43,26 @@ runs: - run: | conda --version conda env list - shell: bash {0} + shell: bash -l {0} - name: Python info - shell: bash -e {0} + shell: bash -l {0} run: | which python3 python3 --version - - name: Activate deeprank2 environment on Linux - shell: bash -el {0} - env: - CMAKE_INSTALL_PREFIX: .local - if: runner.os == 'Linux' - run: | - conda activate deeprank2 - # # Install deeprank2 conda dependencies - # ## sbl channel - # conda install dssp>=4.2.2.1 -c sbl - # ## bioconda channel - # conda install msms>=2.6.1 markov_clustering>=0.0.6 -c bioconda - # ## pytorch and pyg channels - # ### Installing for CPU only on the CI - # conda install pytorch=2.1.1 torchvision>=0.16.1 torchaudio>=2.1.1 cpuonly>=2.0 -c pytorch - # conda install pyg>=2.4.0 
pytorch-scatter>=2.1.2 pytorch-sparse>=0.6.18 pytorch-cluster>=1.6.3 pytorch-spline-conv>=1.2.2 -c pyg - # ## conda-forge channel - # conda install tables>=3.8.0 numpy>=1.21.5 scipy>=1.11.2 h5py>=3.6.0 networkx>=2.6.3 matplotlib>=3.5.1 scikit-learn>=1.0.2 chart-studio>=1.1.0 biopython>=1.81 python-louvain>=0.16 tqdm>=4.63.0 freesasa>=2.1.0 tensorboard>=0.9.0 protobuf>=3.20.1 ruff>=0.1.13 dill>=0.3.8 -c conda-forge - - name: Install the GitHub repository version of the package - shell: bash -el {0} + shell: bash -l {0} if: ${{ inputs.pkg-installation-type == 'repository' }} run: | conda activate deeprank2 pip install .'[${{ inputs.extras-require }}]' - - name: Install the latest released version of the package - shell: bash {0} + + - name: Install the latest released PyPI version of the package + shell: bash -l {0} if: ${{ inputs.pkg-installation-type == 'latest' }} run: | + conda activate deeprank2 pip install pytest rm -r deeprank2 pip install deeprank2 diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index b3b1cf3ea..7af4cc7ae 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -54,9 +54,7 @@ jobs: pkg-installation-type: "repository" - name: Run unit tests - run: | - conda activate deeprank2 - pytest -v + run: pytest -v - name: Verify that we can build the package run: python3 -m build diff --git a/pyproject.toml b/pyproject.toml index e65c3981d..49df56d2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,26 +34,6 @@ classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", ] -dependencies = [ - # "tables >= 3.8.0", - # "numpy >= 1.21.5", - # "scipy >= 1.11.2", - # "h5py >= 3.6.0", - # "networkx >= 2.6.3", - # "matplotlib >= 3.5.1", - # "pdb2sql >= 0.5.1", - # "scikit-learn >= 1.0.2", - # "chart-studio >= 1.1.0", - # "biopython >= 1.81", - # "python-louvain >= 0.16", - # "markov-clustering >= 0.0.6.dev0", - # "tqdm >= 4.63.0", - # "freesasa >= 2.1.0", - # "tensorboard >= 0.9.0", - # "protobuf >= 3.20.1", - # "ruff >= 0.1.13", - # "dill", -] [project.optional-dependencies] # development dependency groups From d27cefe021633a6fd362104aca282ffa1abcd62f Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 13 Feb 2024 16:59:08 +0100 Subject: [PATCH 17/61] fix coveralls and linting actions --- .github/workflows/coveralls.yml | 5 +++++ .github/workflows/linting.yml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index 8a7632985..eb4feff2a 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -39,6 +39,11 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.10"] + # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell + defaults: + run: + shell: bash -l {0} + steps: - uses: actions/checkout@v3 - uses: ./.github/actions/install-python-and-package diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 15b26684f..cdfdeafe2 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -39,6 +39,11 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.10"] + # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell + defaults: + run: + shell: bash -l {0} + steps: - uses: actions/checkout@v3 - uses: ./.github/actions/install-python-and-package From e9a7ec4199d2f60ec325be78e8b33e935114d55c Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 15 Feb 2024 18:47:58 +0100 Subject: [PATCH 18/61] reorganize env files 
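This commit makes the conda YML file the single source of the package's dependencies. A minimal sketch of the resulting setup flow, using the file names introduced by this patch (a later commit in this series renames the YML again, to env/deeprank2.yml):

```bash
# Recreate the development environment from the reorganized env file.
conda env create -f env/deeprank2-py310.yml
conda activate deeprank2
# The only pip-managed dependency left, pdb2sql, is pulled in automatically
# through the YML's trailing `pip:` section (env/requirements.txt).
```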
--- env/{env2.yml => deeprank2-py310.yml} | 6 +++++- env/environment.yml | 22 ---------------------- env/requirements.txt | 7 +------ env/requirements2.txt | 1 - 4 files changed, 6 insertions(+), 30 deletions(-) rename env/{env2.yml => deeprank2-py310.yml} (87%) delete mode 100644 env/environment.yml delete mode 100644 env/requirements2.txt diff --git a/env/env2.yml b/env/deeprank2-py310.yml similarity index 87% rename from env/env2.yml rename to env/deeprank2-py310.yml index 2f777cea6..4a2a7cb08 100644 --- a/env/env2.yml +++ b/env/deeprank2-py310.yml @@ -7,6 +7,9 @@ channels: - conda-forge - sbl dependencies: + - python==3.10 + - pip>=23.3 + - notebook>=7.0.6 - sbl::libcifpp>=5.1.0 - sbl::dssp>=4.2.2.1 - msms>=2.6.1 @@ -36,5 +39,6 @@ dependencies: - protobuf>=3.20.1 - ruff>=0.1.13 - dill>=0.3.8 + - pyarrow>=15.0.0 - pip: - - --requirement requirements2.txt + - --requirement requirements.txt diff --git a/env/environment.yml b/env/environment.yml deleted file mode 100644 index 037f735f9..000000000 --- a/env/environment.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: deeprank2 -channels: - - pytorch - - pyg - - bioconda - - defaults - - conda-forge - - sbl -dependencies: - - pip==23.3.* - - python==3.10.* - - msms==2.6.1 - - dssp>=4.2.2.1 - - pytorch==2.1.1 - - pytorch-mutex==1.0.* - - torchvision==0.16.1 - - torchaudio==2.1.1 - - cpuonly==2.0.* - - pyg==2.4.0 - - notebook==7.0.6 - - pip: - - --requirement requirements.txt diff --git a/env/requirements.txt b/env/requirements.txt index 23b468d33..e68fd71d3 100644 --- a/env/requirements.txt +++ b/env/requirements.txt @@ -1,6 +1 @@ ---find-links https://data.pyg.org/whl/torch-2.1.0+cpu.html -torch_scatter==2.1.2 -torch_sparse==0.6.18 -torch_cluster==1.6.3 -torch_spline_conv==1.2.2 -deeprank2==2.1.2 +pdb2sql>=0.5.1 \ No newline at end of file diff --git a/env/requirements2.txt b/env/requirements2.txt deleted file mode 100644 index e68fd71d3..000000000 --- a/env/requirements2.txt +++ /dev/null @@ -1 +0,0 @@ -pdb2sql>=0.5.1 \ No newline at end of file From 32825be3d05faa10d6758de14067e631ed4eda3f Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 15 Feb 2024 18:48:37 +0100 Subject: [PATCH 19/61] edit env name in package action --- .github/actions/install-python-and-package/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index c0f807fdf..635507429 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -37,7 +37,7 @@ runs: channels: conda-forge python-version: ${{ inputs.python-version }} activate-environment: deeprank2 - environment-file: env/env2.yml + environment-file: env/deeprank2-py${{ join(inputs.python-version, '.') }}.yml use-mamba: true - run: | From 7a591f89f67a68d7cea0f0fa717babe25ce246d6 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 15 Feb 2024 20:10:36 +0100 Subject: [PATCH 20/61] separate docker env files --- env/deeprank2-docker.yml | 44 ++++++++++++++++++++++ env/{deeprank2-py310.yml => deeprank2.yml} | 1 - env/requirements-docker.txt | 2 + 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 env/deeprank2-docker.yml rename env/{deeprank2-py310.yml => deeprank2.yml} (97%) create mode 100644 env/requirements-docker.txt diff --git a/env/deeprank2-docker.yml b/env/deeprank2-docker.yml new file mode 100644 index 000000000..0bb555d41 --- /dev/null +++ b/env/deeprank2-docker.yml @@ -0,0 +1,44 @@ +name: deeprank2 
+channels: + - pytorch + - pyg + - bioconda + - defaults + - conda-forge + - sbl +dependencies: + - python==3.10 + - pip>=23.3 + - notebook>=7.0.6 + - sbl::libcifpp>=5.1.0 + - sbl::dssp>=4.2.2.1 + - msms>=2.6.1 + - markov_clustering>=0.0.6 + - pytorch=2.1.1 + - torchvision>=0.16.1 + - torchaudio>=2.1.1 + - cpuonly>=2.0 + - pyg>=2.4.0 + - pytorch-scatter>=2.1.2 + - pytorch-sparse>=0.6.18 + - pytorch-cluster>=1.6.3 + - pytorch-spline-conv>=1.2.2 + - tables>=3.8.0 + - numpy>=1.21.5 + - scipy>=1.11.2 + - h5py>=3.6.0 + - networkx>=2.6.3 + - matplotlib>=3.5.1 + - scikit-learn>=1.0.2 + - chart-studio>=1.1.0 + - biopython>=1.81 + - python-louvain>=0.16 + - tqdm>=4.63.0 + - freesasa>=2.1.0 + - tensorboard>=0.9.0 + - protobuf>=3.20.1 + - ruff>=0.1.13 + - dill>=0.3.8 + - pyarrow>=15.0.0 + - pip: + - --requirement requirements-docker.txt diff --git a/env/deeprank2-py310.yml b/env/deeprank2.yml similarity index 97% rename from env/deeprank2-py310.yml rename to env/deeprank2.yml index 4a2a7cb08..d307ba817 100644 --- a/env/deeprank2-py310.yml +++ b/env/deeprank2.yml @@ -7,7 +7,6 @@ channels: - conda-forge - sbl dependencies: - - python==3.10 - pip>=23.3 - notebook>=7.0.6 - sbl::libcifpp>=5.1.0 diff --git a/env/requirements-docker.txt b/env/requirements-docker.txt new file mode 100644 index 000000000..d7027d559 --- /dev/null +++ b/env/requirements-docker.txt @@ -0,0 +1,2 @@ +pdb2sql>=0.5.1 +deeprank2>=3.0.0 \ No newline at end of file From 19f83135c40b8210e8cf0733ae49f34041e89cec Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 15 Feb 2024 20:11:13 +0100 Subject: [PATCH 21/61] update docker file with new env files names --- Dockerfile | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Dockerfile b/Dockerfile index f4b52528a..73be0bb87 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,26 +3,26 @@ FROM --platform=linux/x86_64 condaforge/miniforge3:23.3.1-1 # Add files ADD ./tutorials /home/deeprank2/tutorials -ADD ./env/environment.yml /home/deeprank2 -ADD ./env/requirements.txt /home/deeprank2 +ADD ./env/deeprank2-docker.yml /home/deeprank2 +ADD ./env/requirements-docker.txt /home/deeprank2 -# Install RUN \ - apt update -y && - apt install unzip -y && + # Install dependencies and package + apt update -y && \ + apt install unzip -y && \ ## GCC - apt install -y gcc && - ## Conda and pip deps - mamba env create -f /home/deeprank2/environment.yml && - ## Get the data for running the tutorials - if [ -d "/home/deeprank2/tutorials/data_raw" ]; then rm -Rf /home/deeprank2/tutorials/data_raw; fi && - if [ -d "/home/deeprank2/tutorials/data_processed" ]; then rm -Rf /home/deeprank2/tutorials/data_processed; fi && - wget https://zenodo.org/records/8349335/files/data_raw.zip && - unzip data_raw.zip -d data_raw && + apt install -y gcc && \ + ## Create the environment and install the dependencies + mamba env create -f /home/deeprank2/deeprank2-docker.yml && \ + ## Activate the environment automatically when entering the container + echo "source activate deeprank2" >~/.bashrc && \ + # Get the data for running the tutorials + if [ -d "/home/deeprank2/tutorials/data_raw" ]; then rm -Rf /home/deeprank2/tutorials/data_raw; fi && \ + if [ -d "/home/deeprank2/tutorials/data_processed" ]; then rm -Rf /home/deeprank2/tutorials/data_processed; fi && \ + wget https://zenodo.org/records/8349335/files/data_raw.zip && \ + unzip data_raw.zip -d data_raw && \ mv data_raw /home/deeprank2/tutorials -# Activate the environment -RUN echo "source activate deeprank2" >~/.bashrc ENV PATH 
/opt/conda/envs/deeprank2/bin:$PATH # Define working directory From 94b4194c2774437daac32a59a854f8d3f739d71a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 15 Feb 2024 20:11:40 +0100 Subject: [PATCH 22/61] update action.yml with the renamed env file --- .github/actions/install-python-and-package/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 635507429..9f7253360 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -37,7 +37,7 @@ runs: channels: conda-forge python-version: ${{ inputs.python-version }} activate-environment: deeprank2 - environment-file: env/deeprank2-py${{ join(inputs.python-version, '.') }}.yml + environment-file: env/deeprank2.yml use-mamba: true - run: | From 6ab611d272c058aa4e4afa2aaacbbbde3ce31af4 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 19 Feb 2024 11:35:34 +0100 Subject: [PATCH 23/61] add pdb2sql conda installation and remove the pip installation --- env/deeprank2-docker.yml | 1 + env/deeprank2.yml | 3 +-- env/requirements-docker.txt | 1 - env/requirements.txt | 1 - 4 files changed, 2 insertions(+), 4 deletions(-) delete mode 100644 env/requirements.txt diff --git a/env/deeprank2-docker.yml b/env/deeprank2-docker.yml index 0bb555d41..9346f1855 100644 --- a/env/deeprank2-docker.yml +++ b/env/deeprank2-docker.yml @@ -32,6 +32,7 @@ dependencies: - scikit-learn>=1.0.2 - chart-studio>=1.1.0 - biopython>=1.81 + - pdb2sql>=0.5.1 - python-louvain>=0.16 - tqdm>=4.63.0 - freesasa>=2.1.0 diff --git a/env/deeprank2.yml b/env/deeprank2.yml index d307ba817..e0e423054 100644 --- a/env/deeprank2.yml +++ b/env/deeprank2.yml @@ -31,6 +31,7 @@ dependencies: - scikit-learn>=1.0.2 - chart-studio>=1.1.0 - biopython>=1.81 + - pdb2sql>=0.5.1 - python-louvain>=0.16 - tqdm>=4.63.0 - freesasa>=2.1.0 @@ -39,5 +40,3 @@ dependencies: - ruff>=0.1.13 - dill>=0.3.8 - pyarrow>=15.0.0 - - pip: - - --requirement requirements.txt diff --git a/env/requirements-docker.txt b/env/requirements-docker.txt index d7027d559..f03a7051a 100644 --- a/env/requirements-docker.txt +++ b/env/requirements-docker.txt @@ -1,2 +1 @@ -pdb2sql>=0.5.1 deeprank2>=3.0.0 \ No newline at end of file diff --git a/env/requirements.txt b/env/requirements.txt deleted file mode 100644 index e68fd71d3..000000000 --- a/env/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pdb2sql>=0.5.1 \ No newline at end of file From 43fb72d0b1828fb34c03ae3887a2407fcae717c2 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 19 Feb 2024 14:02:43 +0100 Subject: [PATCH 24/61] fix error in data processing tutorials notebooks --- tutorials/data_generation_ppi.ipynb | 16 +++++----------- tutorials/data_generation_srv.ipynb | 21 ++++++++++----------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 8330e9a8b..2d1d9650a 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -255,9 +255,7 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(\n", - " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.'\n", - ")" + "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" ] }, { @@ -341,7 +339,7 @@ "outputs": [], "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data)\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", "df = dataset.hdf5_to_pandas()\n", "df.head()" ] @@ -360,9 +358,7 @@ "metadata": {}, "outputs": [], "source": [ - "fname = os.path.join(\n", - " processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"])\n", - ")\n", + "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", @@ -480,9 +476,7 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(\n", - " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.'\n", - ")" + "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" ] }, { @@ -500,7 +494,7 @@ "outputs": [], "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data)\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", "df = dataset.hdf5_to_pandas()\n", "df.head()" ] diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index 11a776051..1a68f31ad 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -266,9 +266,7 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(\n", - " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.'\n", - ")" + "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" ] }, { @@ -359,7 +357,7 @@ "outputs": [], "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data)\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", "df = dataset.hdf5_to_pandas()\n", "df.head()" ] @@ -378,9 +376,7 @@ "metadata": {}, "outputs": [], "source": [ - "fname = os.path.join(\n", - " processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"])\n", - ")\n", + "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", @@ -500,9 +496,7 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(\n", - " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.'\n", - ")" + "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" ] }, { @@ -520,7 +514,7 @@ "outputs": [], "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data)\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", "df = dataset.hdf5_to_pandas()\n", "df.head()" ] @@ -548,6 +542,11 @@ "source": [ "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. `atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation.\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { From f10336a0556d415c7ee3da3cb64a3e36b4b92cb8 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 19 Feb 2024 15:01:27 +0100 Subject: [PATCH 25/61] update docs for new installation --- README.md | 43 ++++++++++++++++++++++--------------------- docs/installation.md | 31 ++++++++++++++++--------------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index fc2bcd2bf..410b3ed00 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # DeepRank2 -| Badges | | -| :------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **fairness** | [![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6403/badge)](https://bestpractices.coreinfrastructure.org/projects/6403) | -| **package** | [![PyPI version](https://badge.fury.io/py/deeprank2.svg)](https://badge.fury.io/py/deeprank2) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/b1bde03fc0334e07b0cd8a69ce2adeb3)](https://app.codacy.com/gh/DeepRank/deeprank2/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) | -| **docs** | [![Documentation Status](https://readthedocs.org/projects/deeprank2/badge/?version=latest)](https://deeprank2.readthedocs.io/en/latest/?badge=latest) [![DOI](https://zenodo.org/badge/450496579.svg)](https://zenodo.org/badge/latestdoi/450496579) | +| Badges | | +| :------------: | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **fairness** | 
[![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6403/badge)](https://bestpractices.coreinfrastructure.org/projects/6403) | +| **package** | [![PyPI version](https://badge.fury.io/py/deeprank2.svg)](https://badge.fury.io/py/deeprank2) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/b1bde03fc0334e07b0cd8a69ce2adeb3)](https://app.codacy.com/gh/DeepRank/deeprank2/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) | +| **docs** | [![Documentation Status](https://readthedocs.org/projects/deeprank2/badge/?version=latest)](https://deeprank2.readthedocs.io/en/latest/?badge=latest) [![DOI](https://zenodo.org/badge/450496579.svg)](https://zenodo.org/badge/latestdoi/450496579) | | **tests** | [![Build Status](https://github.com/DeepRank/deeprank2/actions/workflows/build-repo.yml/badge.svg)](https://github.com/DeepRank/deeprank2/actions) ![Linting status](https://github.com/DeepRank/deeprank2/actions/workflows/linting.yml/badge.svg?branch=main) [![Coverage Status](https://coveralls.io/repos/github/DeepRank/deeprank2/badge.svg?branch=main)](https://coveralls.io/github/DeepRank/deeprank2?branch=main) ![Python](https://img.shields.io/badge/python-3.10-blue.svg) | -| **running on** | ![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white) | -| **license** | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/license/apache-2-0/) | +| **running on** | ![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white) | +| **license** | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/license/apache-2-0/) | ## Overview @@ -44,8 +44,8 @@ Main features: - [Installation](#installation) - [Containerized Installation](#containerized-installation) - [Local/remote installation](#localremote-installation) - - [YML file installation](#yml-file-installation) - - [Manual installation](#manual-installation) + - [YML file installation (recommended)](#yml-file-installation-recommended) + - [Manual installation (not recommended)](#manual-installation-not-recommended) - [Testing DeepRank2 installation](#testing-deeprank2-installation) - [Contributing](#contributing) - [Using DeepRank2](#using-deeprank2) @@ -95,7 +95,7 @@ Local installation is formally only supported on the latest stable release of ub Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. -#### YML file installation +#### YML file installation (recommended) You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) containing the latest stable release of DeepRank2 and all its dependencies. This will install the CPU-only version of DeepRank2 on Python 3.10. 
@@ -109,27 +109,28 @@ cd deeprank2 # Ensure you are in your base environment conda activate # Create the environment -conda env create -f env/environment.yml +conda env create -f env/deeprank2.yml # Activate the environment conda activate deeprank2 +# Install the latest deeprank2 release +pip install deeprank2 ``` See instructions below to [test](#testing-deeprank2-installation) that the installation was succesful. -#### Manual installation +#### Manual installation (not recommended) -If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not succesful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Jan 2024): +If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not successful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -- [DSSP 4](https://anaconda.org/sbl/dssp): `conda install -c sbl dssp` -- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms` - - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. -- [PyTorch](https://pytorch.org/get-started/locally/): `conda install pytorch torchvision torchaudio cpuonly -c pytorch` - - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). +You can first remove from `env/deeprank2.yml` the packages that cannot be installed properly, or the ones that you want to install differently (e.g., pytorch-related packages if you wish to install the CUDA version), and then proceed with the environment creation by using the edited YML file: `conda env create -f env/deeprank2.yml`. Then activate the environment, and proceed with installing the missing packages, which might fall into the following list. If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Feb 2024): + +- [MSMS](https://anaconda.org/bioconda/msms): [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. +- [PyTorch](https://pytorch.org/get-started/locally/) + - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested on ubuntu using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). - We support torch's CPU library as well as CUDA. - [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. - The exact command to install pyg will depend on the version of pytorch you are using. Please refer to the source's installation instructions (we recommend using the pip installation for this as it also shows the command for the dependencies). 
-- For MacOS with M1 chip users: install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). +- [FreeSASA](https://freesasa.github.io/python/). Finally install deeprank2 itself: `pip install deeprank2`. @@ -145,7 +146,7 @@ The `test` extra is optional, and can be used to install test-related dependenci #### Testing DeepRank2 installation -You can check that all components were installed correctly, using pytest. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). +You can check that all components were installed correctly, using `pytest`. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). The quick test should be sufficient to ensure that the software works, while the full test (a few minutes) will cover a much broader range of settings to ensure everything is correct. diff --git a/docs/installation.md b/docs/installation.md index af2443ea9..41d08a1ea 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -4,8 +4,8 @@ - [Installation](#installation) - [Containerized Installation](#containerized-installation) - [Local/remote installation](#localremote-installation) - - [YML file installation](#yml-file-installation) - - [Manual installation](#manual-installation) + - [YML file installation (recommended)](#yml-file-installation-recommended) + - [Manual installation (not recommended)](#manual-installation-not-recommended) - [Testing DeepRank2 installation](#testing-deeprank2-installation) - [Contributing](#contributing) @@ -46,7 +46,7 @@ Local installation is formally only supported on the latest stable release of ub Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. -#### YML file installation +#### YML file installation (recommended) You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) containing the latest stable release of DeepRank2 and all its dependencies. This will install the CPU-only version of DeepRank2 on Python 3.10. @@ -60,27 +60,28 @@ cd deeprank2 # Ensure you are in your base environment conda activate # Create the environment -conda env create -f env/environment.yml +conda env create -f env/deeprank2.yml # Activate the environment conda activate deeprank2 +# Install the latest deeprank2 release +pip install deeprank2 ``` See instructions below to [test](#testing-deeprank2-installation) that the installation was succesful. -### Manual installation +#### Manual installation (not recommended) -If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not succesful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Jan 2024): +If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not successful, you can install the package manually. 
We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -- [DSSP 4](https://anaconda.org/sbl/dssp): `conda install -c sbl dssp` -- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms` - - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. -- [PyTorch](https://pytorch.org/get-started/locally/): `conda install pytorch torchvision torchaudio cpuonly -c pytorch` - - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). +You can first remove from `env/deeprank2.yml` the packages that cannot be installed properly, or the ones that you want to install differently (e.g., pytorch-related packages if you wish to install the CUDA version), and then proceed with the environment creation by using the edited YML file: `conda env create -f env/deeprank2.yml`. Then activate the environment, and proceed with installing the missing packages, which might fall into the following list. If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Feb 2024): + +- [MSMS](https://anaconda.org/bioconda/msms): [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. +- [PyTorch](https://pytorch.org/get-started/locally/) + - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested on ubuntu using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). - We support torch's CPU library as well as CUDA. - [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. - The exact command to install pyg will depend on the version of pytorch you are using. Please refer to the source's installation instructions (we recommend using the pip installation for this as it also shows the command for the dependencies). -- For MacOS with M1 chip users: install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). +- [FreeSASA](https://freesasa.github.io/python/). Finally install deeprank2 itself: `pip install deeprank2`. @@ -94,9 +95,9 @@ pip install -e .'[test]' The `test` extra is optional, and can be used to install test-related dependencies, useful during development. -### Testing DeepRank2 installation +#### Testing DeepRank2 installation -You can check that all components were installed correctly, using pytest. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). +You can check that all components were installed correctly, using `pytest`. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). The quick test should be sufficient to ensure that the software works, while the full test (a few minutes) will cover a much broader range of settings to ensure everything is correct. 
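As a side note to the testing instructions above: the quick/full split can also be scripted. The sketch below is illustrative only and not part of the patch series; it assumes a clone of the repository with the `test` extra installed, and the choice of `tests/test_dataset.py` as the quick smoke test is an arbitrary example (any small test module works):

```python
# Minimal sketch: run a quick smoke test first, then the full suite only if it passes.
# Assumes the working directory is the root of a deeprank2 checkout with pytest installed.
import sys

import pytest

# Quick check: a single small module (arbitrary example; swap in any fast test file).
exit_code = pytest.main(["tests/test_dataset.py", "-q"])

# Full check: the whole suite; this covers a broader range of settings and takes a few minutes.
if exit_code == 0:
    exit_code = pytest.main(["tests", "-q"])

sys.exit(exit_code)
```

In practice the plain CLI (`pytest tests/test_dataset.py`, then `pytest tests`) does the same job; the `pytest.main` entry point is mainly convenient when embedding the check in other tooling.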
From 0796bc4b3ceff05c8a684ca02595ba7579a1829f Mon Sep 17 00:00:00 2001
From: gcroci2
Date: Mon, 19 Feb 2024 17:14:15 +0100
Subject: [PATCH 26/61] test for rerunning codacy

---
 test.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 test.py

diff --git a/test.py b/test.py
new file mode 100644
index 000000000..e69de29bb

From 7d4247da1c9bb91b1b28f09b24716fa8d2c2e698 Mon Sep 17 00:00:00 2001
From: gcroci2
Date: Mon, 19 Feb 2024 17:14:45 +0100
Subject: [PATCH 27/61] remove test file

---
 test.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 test.py

diff --git a/test.py b/test.py
deleted file mode 100644
index e69de29bb..000000000

From 27edea25f759e3759df83f9713c7fe90fff733b1 Mon Sep 17 00:00:00 2001
From: gcroci2
Date: Thu, 22 Feb 2024 14:58:08 +0100
Subject: [PATCH 28/61] update deeprank2 minimum version for docker

---
 env/requirements-docker.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/env/requirements-docker.txt b/env/requirements-docker.txt
index f03a7051a..44eba4768 100644
--- a/env/requirements-docker.txt
+++ b/env/requirements-docker.txt
@@ -1 +1 @@
-deeprank2>=3.0.0
\ No newline at end of file
+deeprank2>=3.0.1
\ No newline at end of file

From d5ede9fbfc15d47ebe613f388256c579c89fbffe Mon Sep 17 00:00:00 2001
From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com>
Date: Mon, 26 Feb 2024 16:33:26 +0100
Subject: [PATCH 29/61] Update README.md

Co-authored-by: Dani Bodor
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fce07b6af..df17f0712 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,7 @@ pip install deeprank2

 See instructions below to [test](#testing-deeprank2-installation) that the installation was successful.

-#### Manual installation (not recommended)
+#### Manual installation (customizable)

 If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not successful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html).

From f755d41bc75a948669938548d2b64bf445d70b12 Mon Sep 17 00:00:00 2001
From: Giulia Crocioni
Date: Mon, 26 Feb 2024 16:36:50 +0100
Subject: [PATCH 30/61] uniform installation title

---
 docs/installation.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/installation.md b/docs/installation.md
index 41d08a1ea..da06b1132 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -69,7 +69,7 @@ pip install deeprank2

 See instructions below to [test](#testing-deeprank2-installation) that the installation was successful.

-#### Manual installation (not recommended)
+#### Manual installation (customizable)

 If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not successful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html).
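Since the Docker requirements file above now pins `deeprank2>=3.0.1`, it can be useful to verify that a given environment actually satisfies that pin. The following is a minimal sketch, not part of the patch series: it assumes the third-party `packaging` library is available (pip vendors it internally, but your environment may need a separate `pip install packaging`), and the `">=3.0.1"` string simply mirrors `env/requirements-docker.txt`:

```python
# Minimal sketch: compare the installed deeprank2 version against the Docker pin.
from importlib.metadata import PackageNotFoundError, version

from packaging.specifiers import SpecifierSet
from packaging.version import Version

REQUIREMENT = SpecifierSet(">=3.0.1")  # mirrors env/requirements-docker.txt

try:
    installed = Version(version("deeprank2"))
except PackageNotFoundError as err:
    raise SystemExit("deeprank2 is not installed in this environment") from err

# SpecifierSet supports `in` checks against Version objects.
if installed in REQUIREMENT:
    print(f"deeprank2 {installed} satisfies {REQUIREMENT}")
else:
    raise SystemExit(f"deeprank2 {installed} does not satisfy {REQUIREMENT}")
```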
From 9e30a02ed50bd14d91c5e8439f61981376916beb Mon Sep 17 00:00:00 2001 From: Giulia Crocioni Date: Tue, 27 Feb 2024 16:46:08 +0100 Subject: [PATCH 31/61] add python version specification --- env/deeprank2.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/env/deeprank2.yml b/env/deeprank2.yml index e0e423054..846cb2a9e 100644 --- a/env/deeprank2.yml +++ b/env/deeprank2.yml @@ -7,6 +7,7 @@ channels: - conda-forge - sbl dependencies: + - python==3.10 - pip>=23.3 - notebook>=7.0.6 - sbl::libcifpp>=5.1.0 From b4a8bc9439144cd1e8c94c2e2c64976a02719d0c Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 27 Feb 2024 16:49:06 +0100 Subject: [PATCH 32/61] update the section title --- README.md | 2 +- docs/installation.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index df17f0712..910d87c7c 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Main features: - [Containerized Installation](#containerized-installation) - [Local/remote installation](#localremote-installation) - [YML file installation (recommended)](#yml-file-installation-recommended) - - [Manual installation (not recommended)](#manual-installation-not-recommended) + - [Manual installation (customizable)](#manual-installation-customizable) - [Testing DeepRank2 installation](#testing-deeprank2-installation) - [Contributing](#contributing) - [Using DeepRank2](#using-deeprank2) diff --git a/docs/installation.md b/docs/installation.md index da06b1132..394a28d09 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -4,9 +4,9 @@ - [Installation](#installation) - [Containerized Installation](#containerized-installation) - [Local/remote installation](#localremote-installation) - - [YML file installation (recommended)](#yml-file-installation-recommended) - - [Manual installation (not recommended)](#manual-installation-not-recommended) - - [Testing DeepRank2 installation](#testing-deeprank2-installation) + - [YML file installation (recommended)](#yml-file-installation-recommended) + - [Manual installation (customizable)](#manual-installation-customizable) + - [Testing DeepRank2 installation](#testing-deeprank2-installation) - [Contributing](#contributing) # Installation From 61abb2539bcd0cc19189cdf8bd59eff2508cbd9e Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 1 Mar 2024 02:36:28 +0100 Subject: [PATCH 33/61] style: upgrade ruff to 0.3.0 --- .github/workflows/linting.yml | 2 +- CONTRIBUTING.rst | 2 +- README.dev.md | 2 +- env/deeprank2-docker.yml | 2 +- env/deeprank2.yml | 2 +- pyproject.toml | 31 +++++++++++++++---------------- 6 files changed, 20 insertions(+), 21 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index cdfdeafe2..845fdd378 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -51,4 +51,4 @@ jobs: python-version: ${{ matrix.python-version }} extras-require: test - name: Check style against standards using ruff - run: ruff . + run: ruff check . diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 7169f5a1a..901040a17 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -37,7 +37,7 @@ You want to make some kind of change to the code base #. if needed, fork the repository to your own Github profile and create your own feature branch off of the latest main commit. While working on your feature branch, make sure to stay up to date with the main branch by pulling in changes, possibly from the 'upstream' repository (follow the instructions `here `__ and `here `__); #. 
make sure the existing tests still work by running ``python setup.py test``;
 #. add your own tests (if necessary);
-#. ensure the code is correctly linted (``ruff .``) and formatted (``ruff format .``);
+#. ensure the code is correctly linted (``ruff check .``) and formatted (``ruff format .``);
 #. see our `developer's readme `_ for detailed information on our style conventions, etc.;
 #. update or expand the documentation;
 #. `push `_ your feature branch to (your fork of) the DeepRank2 repository on GitHub;

diff --git a/README.dev.md b/README.dev.md
index f4eb2b56f..daf09107f 100644
--- a/README.dev.md
+++ b/README.dev.md
@@ -48,7 +48,7 @@ We use [ruff](https://docs.astral.sh/ruff/) for linting, sorting imports and for

 If you are using VS code, please install and activate the [Ruff extension](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) to automatically format and check linting.

-Otherwise, please ensure check both linting (`ruff fix .`) and formatting (`ruff format .`) before requesting a review.
+Otherwise, please ensure to check both linting (`ruff check .`) and formatting (`ruff format .`) before requesting a review.

 We use [prettier](https://prettier.io/) for formatting most other files. If you are editing or adding non-python files and using VS code, the [Prettier extension](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode) can be installed to auto-format these files as well.

diff --git a/env/deeprank2-docker.yml b/env/deeprank2-docker.yml
index 9346f1855..440daf9c3 100644
--- a/env/deeprank2-docker.yml
+++ b/env/deeprank2-docker.yml
@@ -38,7 +38,7 @@ dependencies:
   - freesasa>=2.1.0
   - tensorboard>=0.9.0
   - protobuf>=3.20.1
-  - ruff>=0.1.13
+  - ruff>=0.3.0
   - dill>=0.3.8
   - pyarrow>=15.0.0
   - pip:

diff --git a/env/deeprank2.yml b/env/deeprank2.yml
index 846cb2a9e..6127fcb66 100644
--- a/env/deeprank2.yml
+++ b/env/deeprank2.yml
@@ -38,6 +38,6 @@ dependencies:
   - freesasa>=2.1.0
   - tensorboard>=0.9.0
   - protobuf>=3.20.1
-  - ruff>=0.1.13
+  - ruff>=0.3.0
   - dill>=0.3.8
   - pyarrow>=15.0.0

diff --git a/pyproject.toml b/pyproject.toml
index f5047020f..7ed740eb6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,6 +66,8 @@ exclude = ["tests*", "*tests.*", "*tests"]

 [tool.ruff]
 line-length = 159
+
+[tool.ruff.lint]
 select = ["ALL"]
 ignore = [
     # Unrealistic for this code base
     "D413", # Missing blank line after last section
 ]
-# Allow autofix for all enabled rules.
+# Autofix settings fixable = ["ALL"] unfixable = ["F401"] # unused imports (should not disappear while editing) - -[tool.ruff.lint.per-file-ignores] -"tests/*" = [ - "S101", # Use of `assert` detected - "PLR2004", # Magic value used in comparison - "D101", # Missing class docstring - "D102", # Missing docstring in public method - "D103", # Missing docstring in public function -] -"docs/*" = ["ALL"] -"tests/perf/*" = ["T201"] # Use of print statements - -[tool.ruff.lint] extend-safe-fixes = [ "D415", # First line should end with a period, question mark, or exclamation point "D300", # Use triple double quotes `"""` @@ -132,5 +121,15 @@ extend-safe-fixes = [ "B006", # Mutable default argument ] -[tool.ruff.isort] -known-first-party = ["deeprank2"] +isort.known-first-party = ["deeprank2"] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = [ + "S101", # Use of `assert` detected + "PLR2004", # Magic value used in comparison + "D101", # Missing class docstring + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function +] +"docs/*" = ["ALL"] +"tests/perf/*" = ["T201"] # Use of print statements From b07706fbc13d8b5fbf57e3bd346801bc66499167 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 1 Mar 2024 02:38:42 +0100 Subject: [PATCH 34/61] style: ignore SLF001 (private member access) in tests folder instead of doing this line by line many times over --- pyproject.toml | 1 + tests/domain/test_forcefield.py | 2 +- tests/features/__init__.py | 2 +- tests/features/test_contact.py | 2 +- tests/molstruct/test_structure.py | 2 +- tests/test_querycollection.py | 6 +++--- tests/utils/test_buildgraph.py | 8 ++++---- tests/utils/test_graph.py | 2 +- tests/utils/test_pssmdata.py | 2 +- 9 files changed, 14 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7ed740eb6..ebdccd282 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -130,6 +130,7 @@ isort.known-first-party = ["deeprank2"] "D101", # Missing class docstring "D102", # Missing docstring in public method "D103", # Missing docstring in public function + "SLF001", # private member access ] "docs/*" = ["ALL"] "tests/perf/*" = ["T201"] # Use of print statements diff --git a/tests/domain/test_forcefield.py b/tests/domain/test_forcefield.py index 4db2c6db7..ee067aaf2 100644 --- a/tests/domain/test_forcefield.py +++ b/tests/domain/test_forcefield.py @@ -10,7 +10,7 @@ def test_atomic_forcefield() -> None: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() # The arginine C-zeta should get a positive charge arg = next(r for r in structure.get_chain("A").residues if r.amino_acid == arginine) diff --git a/tests/features/__init__.py b/tests/features/__init__.py index 541e5c777..921444cac 100644 --- a/tests/features/__init__.py +++ b/tests/features/__init__.py @@ -55,7 +55,7 @@ def build_testgraph( # noqa: C901 try: structure: PDBStructure = get_structure(pdb, Path(pdb_path).stem) finally: - pdb._close() # noqa: SLF001 + pdb._close() if not central_res: nodes = set() diff --git a/tests/features/test_contact.py b/tests/features/test_contact.py index d9e9f001a..94766321a 100644 --- a/tests/features/test_contact.py +++ b/tests/features/test_contact.py @@ -43,7 +43,7 @@ def _get_contact( try: structure = get_structure(pdb, pdb_id) finally: - pdb._close() # noqa: SLF001 + pdb._close() if not chains: chains = [structure.chains[0], structure.chains[0]] diff --git a/tests/molstruct/test_structure.py b/tests/molstruct/test_structure.py index 6074fc265..03a906a9e 
100644 --- a/tests/molstruct/test_structure.py +++ b/tests/molstruct/test_structure.py @@ -12,7 +12,7 @@ def _get_structure(path: str) -> PDBStructure: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() assert structure is not None diff --git a/tests/test_querycollection.py b/tests/test_querycollection.py index 904796634..d93e187d7 100644 --- a/tests/test_querycollection.py +++ b/tests/test_querycollection.py @@ -279,6 +279,6 @@ def test_querycollection_duplicates_add() -> None: "1ATN_2w_2", "1ATN_3w", ] - assert queries._ids_count["residue-ppi:A-B:1ATN_1w"] == 3 # noqa: SLF001 - assert queries._ids_count["residue-ppi:A-B:1ATN_2w"] == 2 # noqa: SLF001 - assert queries._ids_count["residue-ppi:A-B:1ATN_3w"] == 1 # noqa: SLF001 + assert queries._ids_count["residue-ppi:A-B:1ATN_1w"] == 3 + assert queries._ids_count["residue-ppi:A-B:1ATN_2w"] == 2 + assert queries._ids_count["residue-ppi:A-B:1ATN_3w"] == 1 diff --git a/tests/utils/test_buildgraph.py b/tests/utils/test_buildgraph.py index 35559c6c2..a57e7ec74 100644 --- a/tests/utils/test_buildgraph.py +++ b/tests/utils/test_buildgraph.py @@ -12,7 +12,7 @@ def test_get_structure_complete() -> None: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() assert structure is not None @@ -40,7 +40,7 @@ def test_get_structure_from_nmr_with_dna() -> None: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() assert structure is not None assert structure.chains[0].residues[0].amino_acid is None # DNA @@ -52,7 +52,7 @@ def test_residue_contact_pairs() -> None: try: structure = get_structure(pdb, "1ATN") finally: - pdb._close() # noqa: SLF001 + pdb._close() residue_pairs = get_residue_contact_pairs(pdb_path, structure, "A", "B", 8.5) assert len(residue_pairs) > 0 @@ -64,7 +64,7 @@ def test_surrounding_residues() -> None: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() all_residues = structure.get_chain("A").residues # A nicely centered residue diff --git a/tests/utils/test_graph.py b/tests/utils/test_graph.py index b792ff29b..39bd2c9c8 100644 --- a/tests/utils/test_graph.py +++ b/tests/utils/test_graph.py @@ -35,7 +35,7 @@ def graph() -> Graph: try: structure = get_structure(pdb, entry_id) finally: - pdb._close() # noqa: SLF001 + pdb._close() # build a contact from two residues residue0 = structure.chains[0].residues[0] diff --git a/tests/utils/test_pssmdata.py b/tests/utils/test_pssmdata.py index 45d7cf0a6..262a517d4 100644 --- a/tests/utils/test_pssmdata.py +++ b/tests/utils/test_pssmdata.py @@ -10,7 +10,7 @@ def test_add_pssm() -> None: try: structure = get_structure(pdb, "1ATN") finally: - pdb._close() # noqa: SLF001 + pdb._close() for chain in structure.chains: with open(f"tests/data/pssm/1ATN/1ATN.{chain.id}.pdb.pssm", encoding="utf-8") as f: From 5db82a517f4eef48be0012459ae564402cbe837d Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 1 Mar 2024 02:40:04 +0100 Subject: [PATCH 35/61] style: remove obsolete explanation of noqa these can be seen by hovering over the comment on the ignored rule. 
--- deeprank2/trainer.py | 2 +- deeprank2/utils/community_pooling.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deeprank2/trainer.py b/deeprank2/trainer.py index 56b5637ad..e133dbcaa 100644 --- a/deeprank2/trainer.py +++ b/deeprank2/trainer.py @@ -21,7 +21,7 @@ from deeprank2.utils.earlystopping import EarlyStopping from deeprank2.utils.exporters import HDF5OutputExporter, OutputExporter, OutputExporterCollection -# ruff: noqa: PYI041 (redundant-numeric-union), they are used differently in this module +# ruff: noqa: PYI041 (usage depends on type in this module) _log = logging.getLogger(__name__) diff --git a/deeprank2/utils/community_pooling.py b/deeprank2/utils/community_pooling.py index 04384268f..553ab4b4f 100644 --- a/deeprank2/utils/community_pooling.py +++ b/deeprank2/utils/community_pooling.py @@ -11,7 +11,7 @@ from torch_geometric.nn.pool.pool import pool_batch, pool_edge from torch_scatter import scatter_max, scatter_mean -# ruff: noqa: ANN001, ANN201 (missing type hints and return types) +# ruff: noqa: ANN001, ANN201 def plot_graph(graph, cluster) -> None: # noqa:D103 From 6e17343ef9eee1a8da6c735b1f527c3a6b4bfd36 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 1 Mar 2024 02:42:18 +0100 Subject: [PATCH 36/61] style: activate in-file testing for VS-code with this setting, a "play" button will appear in the margin of unit tests, which can run that particular test on the fly. --- .vscode/settings.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.vscode/settings.json b/.vscode/settings.json index 6b68b7366..5cf7354a7 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -17,6 +17,10 @@ }, "notebook.diff.ignoreMetadata": true, + // Pytest + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + // Format all files on save "editor.formatOnSave": true, "editor.defaultFormatter": "esbenp.prettier-vscode", From 5ead4f52331f2183a88931dc989c67aa22e1a02e Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 1 Mar 2024 02:50:10 +0100 Subject: [PATCH 37/61] style: make return tuple explicit --- tests/perf/ppi_perf.py | 2 +- tests/perf/srv_perf.py | 2 +- tests/test_dataset.py | 2 +- tests/test_querycollection.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/perf/ppi_perf.py b/tests/perf/ppi_perf.py index f1e48e356..8b5d687ad 100644 --- a/tests/perf/ppi_perf.py +++ b/tests/perf/ppi_perf.py @@ -41,7 +41,7 @@ os.makedirs(os.path.join(processed_data_path, "atomic")) -def get_pdb_files_and_target_data(data_path: str) -> (list[str], list): +def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list]: csv_data = pd.read_csv(os.path.join(data_path, "BA_values.csv")) pdb_files = glob.glob(os.path.join(data_path, "pdb", "*.pdb")) pdb_files.sort() diff --git a/tests/perf/srv_perf.py b/tests/perf/srv_perf.py index 9de2d8268..94be374ee 100644 --- a/tests/perf/srv_perf.py +++ b/tests/perf/srv_perf.py @@ -87,7 +87,7 @@ os.makedirs(os.path.join(processed_data_path, "atomic")) -def get_pdb_files_and_target_data(data_path: str) -> (list[str], list, list, list, list): +def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list, list, list, list]: csv_data = pd.read_csv(os.path.join(data_path, "srv_target_values.csv")) # before running this script change .ent to .pdb pdb_files = glob.glob(os.path.join(data_path, "pdb", "*.pdb")) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 49f1b4b3b..932e7d3c9 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py 
@@ -31,7 +31,7 @@ def _compute_features_manually( hdf5_path: str, features_transform: dict, feat: str, -) -> (NDArray, float, float): +) -> tuple[NDArray, float, float]: """Return specified feature. This function returns the feature specified read from the hdf5 file, after applying manually features_transform dict. diff --git a/tests/test_querycollection.py b/tests/test_querycollection.py index d93e187d7..7b4694657 100644 --- a/tests/test_querycollection.py +++ b/tests/test_querycollection.py @@ -21,7 +21,7 @@ def _querycollection_tester( feature_modules: ModuleType | list[ModuleType] | None = None, cpu_count: int = 1, combine_output: bool = True, -) -> (QueryCollection, str, list[str]): +) -> tuple[QueryCollection, str, list[str]]: """ Generic function to test QueryCollection class. From 573bbc1f845ac6bfb279a735b4c4b0407f4a2c2d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 4 Mar 2024 11:52:51 +0100 Subject: [PATCH 38/61] add ppi and svr usage in the readme --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c52f3eadf..cb5dfb59c 100644 --- a/README.md +++ b/README.md @@ -165,13 +165,21 @@ For more details, see the [extended documentation](https://deeprank2.rtfd.io/). For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. -A `Query` takes as inputs: +The `Query` parent class takes as inputs: -- a `.pdb` file, representing the protein-protein structure, +- a `.pdb` file, representing the molecular structure, - the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom, - the ids of the chains composing the structure, and - optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files. +Then in particular, for the `SingleResidueVariantQuery` child class: + +- `chain_ids` represents the chain identifier of the variant residue (generally a single capital letter). Note that this does not limit the structure to residues from this chain. The structure contained in the `.pdb` can thus have any number of chains. + +For the `ProteinProteinInterfaceQuery` child class: + +- `chain_ids` represents the chain identifiers of the interacting interfaces (generally a single capital letter each). Note that this does not limit the structure to residues from these chains. But `chain_ids` must contain exactly 2 chains, since right now the code-base handles mono interfaces only. + ```python from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery From 21e74947b3c60edae3163f2a6b3dee1e9a2216b0 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 4 Mar 2024 11:53:39 +0100 Subject: [PATCH 39/61] update docs/ --- docs/getstarted.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/getstarted.md b/docs/getstarted.md index 9a98462d7..11e69ffd7 100644 --- a/docs/getstarted.md +++ b/docs/getstarted.md @@ -8,13 +8,21 @@ For more details, see the [extended documentation](https://deeprank2.rtfd.io/). For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. 
-A `Query` takes as inputs: +The `Query` parent class takes as inputs: -- a `.pdb` file, representing the protein-protein structure, +- a `.pdb` file, representing the molecular structure, - the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom, - the ids of the chains composing the structure, and - optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files. +Then in particular, for the `SingleResidueVariantQuery` child class: + +- `chain_ids` represents the chain identifier of the variant residue (generally a single capital letter). Note that this does not limit the structure to residues from this chain. The structure contained in the `.pdb` can thus have any number of chains. + +For the `ProteinProteinInterfaceQuery` child class: + +- `chain_ids` represents the chain identifiers of the interacting interfaces (generally a single capital letter each). Note that this does not limit the structure to residues from these chains. But `chain_ids` must contain exactly 2 chains, since right now the code-base handles mono interfaces only. + ```python from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery From 843685e1b486ad212268b0d9a3f19370ce41a8c6 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Mon, 4 Mar 2024 15:45:03 +0100 Subject: [PATCH 40/61] ci: check code formatting with ruff in github action --- .github/workflows/linting.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 845fdd378..4f72ac07f 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -51,4 +51,6 @@ jobs: python-version: ${{ matrix.python-version }} extras-require: test - name: Check style against standards using ruff - run: ruff check . 
+ run: | + ruff check + ruff format --check From 0f574b235361077a4dbc43111b7352430862d6ea Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 4 Mar 2024 16:45:17 +0100 Subject: [PATCH 41/61] add action for testing tutorial notebooks --- .github/workflows/notebooks.yml | 57 +++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 58 insertions(+) create mode 100644 .github/workflows/notebooks.yml diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml new file mode 100644 index 000000000..c994233ef --- /dev/null +++ b/.github/workflows/notebooks.yml @@ -0,0 +1,57 @@ +name: notebooks + +on: + push: + paths-ignore: + # specific folder locations + - ".vscode/**" + - "docs/**" + # filetypes + - "**.md" + - "**.rst" + - "**.ipynb" + - "**.cff" + - "**.png" + branches: + - main + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + paths-ignore: + # specific folder locations + - ".vscode/**" + - "docs/**" + # filetypes + - "**.md" + - "**.rst" + - "**.ipynb" + - "**.cff" + - "**.png" + +jobs: + build: + if: github.event.pull_request.draft == false + name: Build for (${{ matrix.python-version }}, ${{ matrix.os }}) + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + python-version: ["3.10"] # ["3.10", "3.11"] + + # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell + defaults: + run: + shell: bash -l {0} + + steps: + - uses: actions/checkout@v3 + + - uses: ./.github/actions/install-python-and-package + with: + python-version: ${{ matrix.python-version }} + extras-require: test, notebooks + pkg-installation-type: "repository" + + - name: Run tutorial notebooks + run: pytest --nbmake tutorials diff --git a/pyproject.toml b/pyproject.toml index ebdccd282..7978fdbfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ test = [ "coveralls", ] publishing = ["build", "twine", "wheel"] +notebooks = ["nbmake"] [project.urls] Documentation = "https://deeprank2.readthedocs.io/en/latest/?badge=latest" From b64c64cc5ab225a8644d0e38536ac11efe869770 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 4 Mar 2024 16:55:18 +0100 Subject: [PATCH 42/61] add step for downloading data --- .github/workflows/notebooks.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index c994233ef..94164add1 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -53,5 +53,12 @@ jobs: extras-require: test, notebooks pkg-installation-type: "repository" + - name: Download the data for the tutorials + shell: bash -l {0} + run: | + wget https://zenodo.org/records/8349335/files/data_raw.zip + unzip data_raw.zip -d data_raw + mv data_raw tutorials + - name: Run tutorial notebooks run: pytest --nbmake tutorials From ce28031411654bb36cf3001f88e5a050110f8eeb Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 4 Mar 2024 17:00:44 +0100 Subject: [PATCH 43/61] skip execution of the cell with nans --- tutorials/training.ipynb | 1544 +++++++++++++++++++------------------- 1 file changed, 774 insertions(+), 770 deletions(-) diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index 784bc03c7..ae8870c47 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -1,772 +1,776 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training Neural Networks\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "## Introduction\n", - "\n", - "\n", - "\n", - "This tutorial will demonstrate the use of DeepRank2 for training graph neural networks (GNNs) and convolutional neural networks (CNNs) using protein-protein interface (PPI) or single-residue variant (SRV) data for classification and regression predictive tasks.\n", - "\n", - "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/8349335). For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n", - "\n", - "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Input data\n", - "\n", - "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", - "\n", - "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. 
The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", - "\n", - "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utilities\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Libraries\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The libraries needed for this tutorial:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import glob\n", - "import os\n", - "import h5py\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score\n", - "import plotly.express as px\n", - "import torch\n", - "import numpy as np\n", - "\n", - "np.seterr(divide=\"ignore\")\n", - "np.seterr(invalid=\"ignore\")\n", - "import pandas as pd\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "from deeprank2.dataset import GraphDataset, GridDataset\n", - "from deeprank2.trainer import Trainer\n", - "from deeprank2.neuralnets.gnn.naive_gnn import NaiveNetwork\n", - "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n", - "from deeprank2.utils.exporters import HDF5OutputExporter\n", - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Paths and sets\n", - "\n", - "The paths for reading the processed data:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_type = \"ppi\"\n", - "level = \"residue\"\n", - "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n", - "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n", - "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `data_type` can be either \"ppi\" or \"srv\", depending on which application the user is most interested in. 
The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n", - "\n", - "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV or/and atomic-level data with no changes, apart from setting `data_type` and `level` parameters in the cell above.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_dict = {}\n", - "df_dict[\"entry\"] = []\n", - "df_dict[\"target\"] = []\n", - "for fname in input_data_path:\n", - " with h5py.File(fname, \"r\") as hdf5:\n", - " for mol in hdf5.keys():\n", - " target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n", - " df_dict[\"entry\"].append(mol)\n", - " df_dict[\"target\"].append(target_value)\n", - "\n", - "df = pd.DataFrame(data=df_dict)\n", - "df.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact) in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) binding one.\n", - "\n", - "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n", - "\n", - "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. 
Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n", - "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n", - "\n", - "print(f\"Data statistics:\\n\")\n", - "print(f\"Total samples: {len(df)}\\n\")\n", - "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n", - "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n", - "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, {round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n", - "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n", - "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n", - "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Classification example\n", - "\n", - "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### GNN\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### GraphDataset\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For training GNNs the user can create `GraphDataset` instances. This class inherits from `DeeprankDataset` class, which in turns inherits from `Dataset` [PyTorch geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n", - "\n", - "A few notes about `GraphDataset` parameters:\n", - "\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n", - "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n", - "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization.\n", - " - If the `standardize` key is `True`, standardization is applied after transformation. 
Standardization consists in applying the following formula on each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, where ${\\mu}$ is the mean and ${\\sigma}$ the standard deviation. Standardization is a scaling method where the values are centered around the mean with a unit standard deviation.\n",
-    "    - The transformation to apply can be specified as a lambda function as a value of the key `transform`, which defaults to `None`.\n",
-    "    - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling validation and test sets. For doing so, `train_source` parameter is used. When `train_source` parameter is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since in other cases it will be ignored and only the one of `train_source` will be considered.\n",
-    "    - Note that transformations have not currently been implemented for the `GridDataset` class.\n",
-    "    - In the example below a cube root transformation and then standardization are applied to all the features. It is also possible to use specific features as keys for indicating that transformation and/or standardization need to be applied to a few features only.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "target = \"binary\"\n",
-    "task = \"classif\"\n",
-    "node_features = [\"res_type\"]\n",
-    "edge_features = [\"distance\"]\n",
-    "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n",
-    "\n",
-    "print(\"Loading training data...\")\n",
-    "dataset_train = GraphDataset(\n",
-    "    hdf5_path=input_data_path,\n",
-    "    subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n",
-    "    node_features=node_features,\n",
-    "    edge_features=edge_features,\n",
-    "    features_transform=features_transform,\n",
-    "    target=target,\n",
-    "    task=task,\n",
-    ")\n",
-    "print(\"\\nLoading validation data...\")\n",
-    "dataset_val = GraphDataset(\n",
-    "    hdf5_path=input_data_path,\n",
-    "    subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n",
-    "    train_source=dataset_train,\n",
-    ")\n",
-    "print(\"\\nLoading test data...\")\n",
-    "dataset_test = GraphDataset(\n",
-    "    hdf5_path=input_data_path,\n",
-    "    subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n",
-    "    train_source=dataset_train,\n",
-    ")"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Trainer\n",
-    "\n",
-    "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "A few notes about `Trainer` parameters:\n",
-    "\n",
-    "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `NaiveNetwork` (implemented in `deeprank2.neuralnets.gnn.naive_gnn`). All GNN architectures already implemented in the package can be found [here](https://github.com/DeepRank/deeprank-core/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n",
-    "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential imbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to False.\n",
-    "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. By default, CUDA is not used and `ngpu` is 0.\n",
-    "- The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Later the results saved by `HDF5OutputExporter` will be read and evaluated.\n"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Training\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "trainer = Trainer(\n",
-    "    neuralnet=NaiveNetwork,\n",
-    "    dataset_train=dataset_train,\n",
-    "    dataset_val=dataset_val,\n",
-    "    dataset_test=dataset_test,\n",
-    "    output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n",
-    ")"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The default optimizer is `torch.optim.Adam`. It is possible to specify the optimizer's parameters or to use another PyTorch optimizer object:\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "optimizer = torch.optim.SGD\n",
-    "lr = 1e-3\n",
-    "weight_decay = 0.001\n",
-    "\n",
-    "trainer.configure_optimizers(optimizer, lr, weight_decay)"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The default loss function for classification is `torch.nn.CrossEntropyLoss` and for regression it is `torch.nn.MSELoss`. It is also possible to set some other PyTorch loss functions by using `Trainer.set_lossfunction` method, although not all are currently implemented.\n",
-    "\n",
-    "Then the model can be trained using the `train()` method of the `Trainer` class.\n",
-    "\n",
-    "A few notes about `train()` method parameters:\n",
-    "\n",
-    "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicates the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n",
-    "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` a few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n",
-    "- `num_workers` can be set for indicating how many subprocesses to use for data loading.
The default is 0 and it means that the data will be loaded in the main process.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "epochs = 20\n", - "batch_size = 8\n", - "earlystop_patience = 5\n", - "earlystop_maxgap = 0.1\n", - "min_epoch = 10\n", - "\n", - "trainer.train(\n", - " nepoch=epochs,\n", - " batch_size=batch_size,\n", - " earlystop_patience=earlystop_patience,\n", - " earlystop_maxgap=earlystop_maxgap,\n", - " min_epoch=min_epoch,\n", - " validate=True,\n", - " filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n", - ")\n", - "\n", - "epoch = trainer.epoch_saved_model\n", - "print(f\"Model saved at epoch {epoch}\")\n", - "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f\"Total # of parameters: {pytorch_total_params}\")\n", - "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Testing\n", - "\n", - "And the trained model can be tested on `dataset_test`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Results visualization\n", - "\n", - "Finally, the results saved by `HDF5OutputExporter` can be inspected, which can be found in the `data/ppi/gnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n", - "\n", - "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. Training phase includes validation results as well. 
This HDF5 file can be read as a Pandas Dataframe:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_train = pd.read_hdf(\n", - " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", - ")\n", - "output_test = pd.read_hdf(\n", - " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", - ")\n", - "output_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", - "\n", - "For example, the loss across the epochs can be plotted for the training and the validation sets:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", - "\n", - "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", - "\n", - "fig.update_layout(\n", - " xaxis_title=\"Epoch #\",\n", - " yaxis_title=\"Loss\",\n", - " title=\"Loss vs epochs - GNN training\",\n", - " width=700,\n", - " height=400,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 0.5\n", - "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", - "\n", - "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", - " df_plot_phase = df_plot[(df_plot.phase == set)]\n", - " y_true = df_plot_phase.target\n", - " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", - "\n", - " print(f\"\\nMetrics for {set}:\")\n", - " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", - " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f\"AUC: {round(auc_score, 1)}\")\n", - " print(f\"Considering a threshold of {threshold}\")\n", - " y_pred = (y_score > threshold) * 1\n", - " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", - " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the poor performance of this network is due to the small number of datapoints used in this tutorial. 
For a more reliable network we suggest using a number of data points on the order of at least tens of thousands.\n", - "\n", - "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### CNN\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### GridDataset\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For training CNNs the user can create `GridDataset` instances.\n", - "\n", - "A few notes about `GridDataset` parameters:\n", - "\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. Since grids features are derived from node and edge features mapped from graphs to grid, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\".\n", - "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target = \"binary\"\n", - "task = \"classif\"\n", - "\n", - "print(\"Loading training data...\")\n", - "dataset_train = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " target=target,\n", - " task=task,\n", - ")\n", - "print(\"\\nLoading validation data...\")\n", - "dataset_val = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source=dataset_train,\n", - ")\n", - "print(\"\\nLoading test data...\")\n", - "dataset_test = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source=dataset_train,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Trainer\n", - "\n", - "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`). 
All CNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones.\n", - "- The rest of the `Trainer` parameters can be used as explained already for graphs.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Training\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = torch.optim.SGD\n", - "lr = 1e-3\n", - "weight_decay = 0.001\n", - "epochs = 20\n", - "batch_size = 8\n", - "earlystop_patience = 5\n", - "earlystop_maxgap = 0.1\n", - "min_epoch = 10\n", - "\n", - "trainer = Trainer(\n", - " neuralnet=CnnClassification,\n", - " dataset_train=dataset_train,\n", - " dataset_val=dataset_val,\n", - " dataset_test=dataset_test,\n", - " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))],\n", - ")\n", - "\n", - "trainer.configure_optimizers(optimizer, lr, weight_decay)\n", - "\n", - "trainer.train(\n", - " nepoch=epochs,\n", - " batch_size=batch_size,\n", - " earlystop_patience=earlystop_patience,\n", - " earlystop_maxgap=earlystop_maxgap,\n", - " min_epoch=min_epoch,\n", - " validate=True,\n", - " filename=os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"),\n", - ")\n", - "\n", - "epoch = trainer.epoch_saved_model\n", - "print(f\"Model saved at epoch {epoch}\")\n", - "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f\"Total # of parameters: {pytorch_total_params}\")\n", - "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Testing\n", - "\n", - "And the trained model can be tested on `dataset_test`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Results visualization\n", - "\n", - "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `data/ppi/cnn_classif/` or `data/srv/cnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_train = pd.read_hdf(\n", - " os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", - ")\n", - "output_test = pd.read_hdf(\n", - " os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", - ")\n", - "output_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", - "\n", - "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", - "\n", - "fig.update_layout(\n", - " xaxis_title=\"Epoch #\",\n", - " 
yaxis_title=\"Loss\",\n", - " title=\"Loss vs epochs - CNN training\",\n", - " width=700,\n", - " height=400,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And some metrics of interest for classification tasks:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 0.5\n", - "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", - "\n", - "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", - " df_plot_phase = df_plot[(df_plot.phase == set)]\n", - " y_true = df_plot_phase.target\n", - " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", - "\n", - " print(f\"\\nMetrics for {set}:\")\n", - " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", - " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f\"AUC: {round(auc_score, 1)}\")\n", - " print(f\"Considering a threshold of {threshold}\")\n", - " y_pred = (y_score > threshold) * 1\n", - " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", - " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. Feel free to choose the approach that best aligns with your particular problem!\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "deeprank2", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training Neural Networks\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "\n", + "\n", + "This tutorial will demonstrate the use of DeepRank2 for training graph neural networks (GNNs) and convolutional neural networks (CNNs) using protein-protein interface (PPI) or single-residue variant (SRV) data for classification and regression predictive tasks.\n", + "\n", + "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/8349335). 
For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n",
+ "\n",
+ "This tutorial also assumes a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed and for which many online tutorials exist.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Input data\n",
+ "\n",
+ "If you have previously run the `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebooks, then their output can be directly used as input for this tutorial.\n",
+ "\n",
+ "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The folder name and location are optional but recommended, as they are what we will use to refer to the folder throughout the tutorial.\n",
+ "\n",
+ "Note that the datasets contain only ~100 data points each, which is not enough to develop a reliable predictive model; their use here is purely demonstrative.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Utilities\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Libraries\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The libraries needed for this tutorial:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import logging\n",
+ "import glob\n",
+ "import os\n",
+ "import h5py\n",
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score\n",
+ "import plotly.express as px\n",
+ "import torch\n",
+ "import numpy as np\n",
+ "\n",
+ "np.seterr(divide=\"ignore\")\n",
+ "np.seterr(invalid=\"ignore\")\n",
+ "\n",
+ "logging.basicConfig(level=logging.INFO)\n",
+ "from deeprank2.dataset import GraphDataset, GridDataset\n",
+ "from deeprank2.trainer import Trainer\n",
+ "from deeprank2.neuralnets.gnn.naive_gnn import NaiveNetwork\n",
+ "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n",
+ "from deeprank2.utils.exporters import HDF5OutputExporter\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Paths and sets\n",
+ "\n",
+ "The paths for reading the processed data:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_type = \"ppi\"\n",
+ "level = \"residue\"\n",
+ "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n",
+ "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n",
+ "output_path = os.path.join(\"data_processed\", data_type, level) # for saving prediction results"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `data_type` can be either \"ppi\" or \"srv\", depending
on which application the user is most interested in. The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n",
+ "\n",
+ "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV and/or atomic-level data with no changes, apart from setting the `data_type` and `level` parameters in the cell above.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A Pandas DataFrame containing the data points' IDs and the binary target values can be defined:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_dict = {}\n",
+ "df_dict[\"entry\"] = []\n",
+ "df_dict[\"target\"] = []\n",
+ "for fname in input_data_path:\n",
+ "    with h5py.File(fname, \"r\") as hdf5:\n",
+ "        for mol in hdf5.keys():\n",
+ "            target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n",
+ "            df_dict[\"entry\"].append(mol)\n",
+ "            df_dict[\"target\"].append(target_value)\n",
+ "\n",
+ "df = pd.DataFrame(data=df_dict)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact), expressed in nM, while the second represents its binary mapping, where 0 (BA > 500 nM) denotes a non-binding complex and 1 (BA <= 500 nM) a binding one (see the short sketch below).\n",
+ "\n",
+ "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n",
+ "\n",
+ "The Pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets.
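To make the BA-to-\"binary\" mapping concrete, here is a minimal sketch in plain Python (the 500 nM cutoff comes from the description above; the helper name is hypothetical and not part of the DeepRank2 API):

```python
# Sketch of the BA -> "binary" mapping described above,
# assuming `ba_nm` is a binding affinity expressed in nM.
def ba_to_binary(ba_nm: float, threshold_nm: float = 500.0) -> int:
    """Return 1 for a binding complex (BA <= 500 nM), 0 otherwise."""
    return int(ba_nm <= threshold_nm)

assert ba_to_binary(100.0) == 1   # strong binder -> class 1
assert ba_to_binary(1000.0) == 0  # weak/non-binder -> class 0
```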
Training and validation sets will be used during training for updating the network weights, while the test set will be held out as an independent set and will be used later for the model evaluation.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n",
+ "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n",
+ "\n",
+ "print(\"Data statistics:\\n\")\n",
+ "print(f\"Total samples: {len(df)}\\n\")\n",
+ "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n",
+ "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n",
+ "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n",
+ "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n",
+ "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, {round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n",
+ "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n",
+ "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n",
+ "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n",
+ "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Classification example\n",
+ "\n",
+ "A GNN and a CNN can be trained for a classification predictive task, which consists of predicting the \"binary\" target values.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### GNN\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### GraphDataset\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For training GNNs the user can create `GraphDataset` instances. This class inherits from the `DeeprankDataset` class, which in turn inherits from the `Dataset` [PyTorch Geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n",
+ "\n",
+ "A few notes about `GraphDataset` parameters:\n",
+ "\n",
+ "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n",
+ "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n",
+ "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, where the transformations are lambda functions and/or standardization.\n",
+ "  - If the `standardize` key is `True`, standardization is applied after transformation (see the short sketch below for the order of operations).
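As a minimal illustration of that order of operations, here is a plain NumPy sketch on a toy array (this is not the DeepRank2 API, just the two steps applied by hand):

```python
import numpy as np

x = np.array([1.0, 8.0, 27.0, 64.0])  # toy feature values
transformed = np.cbrt(x)              # 1) user-defined transform (cube root, as in the example below)
standardized = (transformed - transformed.mean()) / transformed.std()  # 2) standardization

print(round(standardized.mean(), 6), round(standardized.std(), 6))  # ~0.0 and 1.0
```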
Standardization consists of applying the following formula to each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, where ${\\mu}$ is the mean and ${\\sigma}$ the standard deviation. Standardization is a scaling method where the values are centered around the mean with a unit standard deviation.\n",
+ "  - The transformation to apply can be specified as a lambda function as a value of the key `transform`, which defaults to `None`.\n",
+ "  - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling the validation and test sets. For doing so, the `train_source` parameter is used. When `train_source` is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since for the other datasets it is ignored and the transformations of `train_source` are used instead.\n",
+ "  - Note that transformations have not currently been implemented for the `GridDataset` class.\n",
+ "  - In the example below a cube root transformation followed by standardization is applied to all the features. It is also possible to use specific feature names as keys to indicate that the transformation and/or standardization should be applied to those features only.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "target = \"binary\"\n",
+ "task = \"classif\"\n",
+ "node_features = [\"res_type\"]\n",
+ "edge_features = [\"distance\"]\n",
+ "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n",
+ "\n",
+ "print(\"Loading training data...\")\n",
+ "dataset_train = GraphDataset(\n",
+ "    hdf5_path=input_data_path,\n",
+ "    subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n",
+ "    node_features=node_features,\n",
+ "    edge_features=edge_features,\n",
+ "    features_transform=features_transform,\n",
+ "    target=target,\n",
+ "    task=task,\n",
+ ")\n",
+ "print(\"\\nLoading validation data...\")\n",
+ "dataset_val = GraphDataset(\n",
+ "    hdf5_path=input_data_path,\n",
+ "    subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n",
+ "    train_source=dataset_train,\n",
+ ")\n",
+ "print(\"\\nLoading test data...\")\n",
+ "dataset_test = GraphDataset(\n",
+ "    hdf5_path=input_data_path,\n",
+ "    subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n",
+ "    train_source=dataset_train,\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Trainer\n",
+ "\n",
+ "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A few notes about `Trainer` parameters:\n",
+ "\n",
+ "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `NaiveNetwork` (implemented in `deeprank2.neuralnets.gnn.naive_gnn`).
All GNN architectures already implemented in the package can be found [here](https://github.com/DeepRank/deeprank-core/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n",
+ "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential imbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to `False`.\n",
+ "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. By default, CUDA is not used and `ngpu` is 0.\n",
+ "- The user can specify a deeprank2 exporter or a custom one in the `output_exporters` parameter, together with the path where the results will be saved. Exporters are used for storing prediction information collected during training and testing. Later on, the results saved by `HDF5OutputExporter` will be read and evaluated.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Training\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainer = Trainer(\n",
+ "    neuralnet=NaiveNetwork,\n",
+ "    dataset_train=dataset_train,\n",
+ "    dataset_val=dataset_val,\n",
+ "    dataset_test=dataset_test,\n",
+ "    output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The default optimizer is `torch.optim.Adam`. It is possible to specify the optimizer's parameters or to use another PyTorch optimizer object:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimizer = torch.optim.SGD\n",
+ "lr = 1e-3\n",
+ "weight_decay = 0.001\n",
+ "\n",
+ "trainer.configure_optimizers(optimizer, lr, weight_decay)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The default loss function for classification is `torch.nn.CrossEntropyLoss` and for regression it is `torch.nn.MSELoss`. It is also possible to set some other PyTorch loss functions by using the `Trainer.set_lossfunction` method, although not all of them are currently implemented.\n",
+ "\n",
+ "Then the model can be trained using the `train()` method of the `Trainer` class.\n",
+ "\n",
+ "A few notes about `train()` method parameters:\n",
+ "\n",
+ "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling the early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicates the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n",
+ "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` a few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n",
+ "- `num_workers` can be set to indicate how many subprocesses to use for data loading.
The default is 0, meaning that the data will be loaded in the main process.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "epochs = 20\n",
+ "batch_size = 8\n",
+ "earlystop_patience = 5\n",
+ "earlystop_maxgap = 0.1\n",
+ "min_epoch = 10\n",
+ "\n",
+ "trainer.train(\n",
+ "    nepoch=epochs,\n",
+ "    batch_size=batch_size,\n",
+ "    earlystop_patience=earlystop_patience,\n",
+ "    earlystop_maxgap=earlystop_maxgap,\n",
+ "    min_epoch=min_epoch,\n",
+ "    validate=True,\n",
+ "    filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n",
+ ")\n",
+ "\n",
+ "epoch = trainer.epoch_saved_model\n",
+ "print(f\"Model saved at epoch {epoch}\")\n",
+ "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n",
+ "print(f\"Total # of parameters: {pytorch_total_params}\")\n",
+ "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n",
+ "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Testing\n",
+ "\n",
+ "And the trained model can be tested on `dataset_test`:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainer.test()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Results visualization\n",
+ "\n",
+ "Finally, the results saved by `HDF5OutputExporter` can be inspected. They can be found in the `gnn_classif` subfolder of the `output_path` defined above, in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n",
+ "\n",
+ "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html), one per phase, e.g. training and testing if both are run, or only one of them otherwise. The training phase includes validation results as well.
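For instance, a quick way to list the exported phases (a small sketch; it reuses the `output_path` and `task` variables set in the cells above):

```python
import os
import h5py

# Each HDF5 group in the exporter output corresponds to one phase.
with h5py.File(os.path.join(output_path, f"gnn_{task}", "output_exporter.hdf5"), "r") as f:
    print(list(f.keys()))  # e.g. ['training', 'testing']
```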
This HDF5 file can be read as a Pandas DataFrame:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_train = pd.read_hdf(\n",
+ "    os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n",
+ ")\n",
+ "output_test = pd.read_hdf(\n",
+ "    os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n",
+ ")\n",
+ "output_train.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The DataFrames contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n",
+ "\n",
+ "For example, the loss across the epochs can be plotted for the training and the validation sets:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n",
+ "\n",
+ "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n",
+ "\n",
+ "fig.update_layout(\n",
+ "    xaxis_title=\"Epoch #\",\n",
+ "    yaxis_title=\"Loss\",\n",
+ "    title=\"Loss vs epochs - GNN training\",\n",
+ "    width=700,\n",
+ "    height=400,\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip-execution"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "threshold = 0.5\n",
+ "df = pd.concat([output_train, output_test])\n",
+ "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n",
+ "\n",
+ "for phase_set in [\"training\", \"validation\", \"testing\"]:\n",
+ "    df_plot_phase = df_plot[(df_plot.phase == phase_set)]\n",
+ "    y_true = df_plot_phase.target\n",
+ "    y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n",
+ "\n",
+ "    print(f\"\\nMetrics for {phase_set}:\")\n",
+ "    fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n",
+ "    auc_score = auc(fpr_roc, tpr_roc)\n",
+ "    print(f\"AUC: {round(auc_score, 1)}\")\n",
+ "    print(f\"Considering a threshold of {threshold}\")\n",
+ "    y_pred = (y_score > threshold) * 1\n",
+ "    print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n",
+ "    print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n",
+ "    print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n",
+ "    print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that the poor performance of this network is due to the small number of data points used in this tutorial.
For a more reliable network we suggest using a number of data points on the order of at least tens of thousands.\n",
+ "\n",
+ "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### CNN\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### GridDataset\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For training CNNs the user can create `GridDataset` instances.\n",
+ "\n",
+ "A few notes about `GridDataset` parameters:\n",
+ "\n",
+ "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. Since grid features are derived from node and edge features mapped from graphs to grids, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\".\n",
+ "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "target = \"binary\"\n",
+ "task = \"classif\"\n",
+ "\n",
+ "print(\"Loading training data...\")\n",
+ "dataset_train = GridDataset(\n",
+ "    hdf5_path=input_data_path,\n",
+ "    subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n",
+ "    target=target,\n",
+ "    task=task,\n",
+ ")\n",
+ "print(\"\\nLoading validation data...\")\n",
+ "dataset_val = GridDataset(\n",
+ "    hdf5_path=input_data_path,\n",
+ "    subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n",
+ "    train_source=dataset_train,\n",
+ ")\n",
+ "print(\"\\nLoading test data...\")\n",
+ "dataset_test = GridDataset(\n",
+ "    hdf5_path=input_data_path,\n",
+ "    subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n",
+ "    train_source=dataset_train,\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Trainer\n",
+ "\n",
+ "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`).
All CNN architectures already implemented in the package can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones.\n",
+ "- The rest of the `Trainer` parameters can be used as already explained for graphs.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Training\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimizer = torch.optim.SGD\n",
+ "lr = 1e-3\n",
+ "weight_decay = 0.001\n",
+ "epochs = 20\n",
+ "batch_size = 8\n",
+ "earlystop_patience = 5\n",
+ "earlystop_maxgap = 0.1\n",
+ "min_epoch = 10\n",
+ "\n",
+ "trainer = Trainer(\n",
+ "    neuralnet=CnnClassification,\n",
+ "    dataset_train=dataset_train,\n",
+ "    dataset_val=dataset_val,\n",
+ "    dataset_test=dataset_test,\n",
+ "    output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))],\n",
+ ")\n",
+ "\n",
+ "trainer.configure_optimizers(optimizer, lr, weight_decay)\n",
+ "\n",
+ "trainer.train(\n",
+ "    nepoch=epochs,\n",
+ "    batch_size=batch_size,\n",
+ "    earlystop_patience=earlystop_patience,\n",
+ "    earlystop_maxgap=earlystop_maxgap,\n",
+ "    min_epoch=min_epoch,\n",
+ "    validate=True,\n",
+ "    filename=os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"),\n",
+ ")\n",
+ "\n",
+ "epoch = trainer.epoch_saved_model\n",
+ "print(f\"Model saved at epoch {epoch}\")\n",
+ "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n",
+ "print(f\"Total # of parameters: {pytorch_total_params}\")\n",
+ "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n",
+ "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Testing\n",
+ "\n",
+ "And the trained model can be tested on `dataset_test`:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainer.test()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Results visualization\n",
+ "\n",
+ "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `cnn_classif` subfolder of the `output_path` defined above, in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_train = pd.read_hdf(\n",
+ "    os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n",
+ ")\n",
+ "output_test = pd.read_hdf(\n",
+ "    os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n",
+ ")\n",
+ "output_train.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n",
+ "\n",
+ "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n",
+ "\n",
+ "fig.update_layout(\n",
+ "    xaxis_title=\"Epoch #\",\n",
+ "    
yaxis_title=\"Loss\",\n",
+ "    title=\"Loss vs epochs - CNN training\",\n",
+ "    width=700,\n",
+ "    height=400,\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And some metrics of interest for classification tasks:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "threshold = 0.5\n",
+ "df = pd.concat([output_train, output_test])\n",
+ "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n",
+ "\n",
+ "for phase_set in [\"training\", \"validation\", \"testing\"]:\n",
+ "    df_plot_phase = df_plot[(df_plot.phase == phase_set)]\n",
+ "    y_true = df_plot_phase.target\n",
+ "    y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n",
+ "\n",
+ "    print(f\"\\nMetrics for {phase_set}:\")\n",
+ "    fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n",
+ "    auc_score = auc(fpr_roc, tpr_roc)\n",
+ "    print(f\"AUC: {round(auc_score, 1)}\")\n",
+ "    print(f\"Considering a threshold of {threshold}\")\n",
+ "    y_pred = (y_score > threshold) * 1\n",
+ "    print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n",
+ "    print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n",
+ "    print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n",
+ "    print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options.
Feel free to choose the approach that best aligns with your particular problem!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deeprank2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 } From 291531a724d0fd34bc3e62e3d8735dd6f7a7d690 Mon Sep 17 00:00:00 2001 From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com> Date: Tue, 5 Mar 2024 10:54:21 +0100 Subject: [PATCH 44/61] Update .github/workflows/notebooks.yml Co-authored-by: Dani Bodor --- .github/workflows/notebooks.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index 94164add1..e82e88a14 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -9,7 +9,6 @@ on: # filetypes - "**.md" - "**.rst" - - "**.ipynb" - "**.cff" - "**.png" branches: From 8db7855509c736fe050333a479fdb5633d4d44c3 Mon Sep 17 00:00:00 2001 From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com> Date: Tue, 5 Mar 2024 10:54:42 +0100 Subject: [PATCH 45/61] Update .github/workflows/notebooks.yml Co-authored-by: Dani Bodor --- .github/workflows/notebooks.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index e82e88a14..b1e59ea9d 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -22,7 +22,6 @@ on: # filetypes - "**.md" - "**.rst" - - "**.ipynb" - "**.cff" - "**.png" From 4e9918a000df0f089677610c693b313055011e1a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 5 Mar 2024 11:02:05 +0100 Subject: [PATCH 46/61] add suggestions for query parameters explaination --- README.md | 23 +++++++++-------------- docs/getstarted.md | 23 +++++++++-------------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index cb5dfb59c..7fddb53f3 100644 --- a/README.md +++ b/README.md @@ -165,20 +165,15 @@ For more details, see the [extended documentation](https://deeprank2.rtfd.io/). For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. -The `Query` parent class takes as inputs: - -- a `.pdb` file, representing the molecular structure, -- the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom, -- the ids of the chains composing the structure, and -- optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files. - -Then in particular, for the `SingleResidueVariantQuery` child class: - -- `chain_ids` represents the chain identifier of the variant residue (generally a single capital letter). Note that this does not limit the structure to residues from this chain. The structure contained in the `.pdb` can thus have any number of chains. - -For the `ProteinProteinInterfaceQuery` child class: - -- `chain_ids` represents the chain identifiers of the interacting interfaces (generally a single capital letter each). Note that this does not limit the structure to residues from these chains. 
But `chain_ids` must contain exactly 2 chains, since right now the code-base handles mono interfaces only.
+A `Query` takes as inputs:
+
+- A `.pdb` file, representing the molecular structure.
+- The resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom.
+- `chain_ids`, the chain ID or IDs (generally single capital letter(s)).
+  - `SingleResidueVariantQuery` takes a single ID, which represents the chain containing the variant residue.
+  - `ProteinProteinInterfaceQuery` takes a pair of IDs, which represent the chains between which the interface exists.
+  - Note that in either case this does not limit the structure to residues from this chain (or these chains). The structure contained in the `.pdb` can thus have any number of chains, and residues from these chains will be included in the graphs and grids produced by DeepRank2 (if they are within the `influence_radius`).
+- Optionally, the corresponding position-specific scoring matrices (PSSMs), in the form of `.pssm` files.

 ```python
 from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery
diff --git a/docs/getstarted.md b/docs/getstarted.md
index 11e69ffd7..1f2eb561c 100644
--- a/docs/getstarted.md
+++ b/docs/getstarted.md
@@ -8,20 +8,15 @@ For more details, see the [extended documentation](https://deeprank2.rtfd.io/).
 For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`.

-The `Query` parent class takes as inputs:
-
-- a `.pdb` file, representing the molecular structure,
-- the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom,
-- the ids of the chains composing the structure, and
-- optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files.
-
-Then in particular, for the `SingleResidueVariantQuery` child class:
-
-- `chain_ids` represents the chain identifier of the variant residue (generally a single capital letter). Note that this does not limit the structure to residues from this chain. The structure contained in the `.pdb` can thus have any number of chains.
-
-For the `ProteinProteinInterfaceQuery` child class:
-
-- `chain_ids` represents the chain identifiers of the interacting interfaces (generally a single capital letter each). Note that this does not limit the structure to residues from these chains. But `chain_ids` must contain exactly 2 chains, since right now the code-base handles mono interfaces only.
+A `Query` takes as inputs:
+
+- A `.pdb` file, representing the molecular structure.
+- The resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom.
+- `chain_ids`, the chain ID or IDs (generally single capital letter(s)).
+  - `SingleResidueVariantQuery` takes a single ID, which represents the chain containing the variant residue.
+  - `ProteinProteinInterfaceQuery` takes a pair of IDs, which represent the chains between which the interface exists.
+  - Note that in either case this does not limit the structure to residues from this chain (or these chains). The structure contained in the `.pdb` can thus have any number of chains, and residues from these chains will be included in the graphs and grids produced by DeepRank2 (if they are within the `influence_radius`). 
+- Optionally, the corresponding position-specific scoring matrices (PSSMs), in the form of `.pssm` files.

 ```python
 from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery

From a49ff6e417321b0b0b08be7a24ca27ac9863590e Mon Sep 17 00:00:00 2001
From: gcroci2 <g.crocioni@esciencecenter.nl>
Date: Tue, 5 Mar 2024 11:17:31 +0100
Subject: [PATCH 47/61] add explaination about the queries to the readme

---
 README.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7fddb53f3..310e93fe6 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,12 @@
 DeepRank2 is an open-source deep learning (DL) framework for data mining of protein-protein interfaces (PPIs) or single-residue variants (SRVs). This package is an improved and unified version of three previously developed packages: [DeepRank](https://github.com/DeepRank/deeprank), [DeepRank-GNN](https://github.com/DeepRank/Deeprank-GNN), and [DeepRank-Mut](https://github.com/DeepRank/DeepRank-Mut).

-DeepRank2 allows for transformation of (pdb formatted) molecular data into 3D representations (either grids or graphs) containing structural and physico-chemical information, which can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either [CNNs](https://en.wikipedia.org/wiki/Convolutional_neural_network) (for grids) or [GNNs](https://en.wikipedia.org/wiki/Graph_neural_network) (for graphs), as well as output exporters for evaluating performances.
+As input, DeepRank2 takes [PDB-formatted](https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html) atomic structures, and maps them to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. The user can configure two types of 3D structures as input for the featurization phase:
+
+- PPIs, for mining interaction patterns within protein-protein complexes, implemented by the `ProteinProteinInterfaceQuery` class;
+- SRVs, for mining mutation phenotypes within protein structures, implemented by the `SingleResidueVariantQuery` class.
+
+The physico-chemical and geometrical features are then computed and assigned to each node and edge. The user can choose which features to generate from several pre-existing options defined in the package, or define custom feature modules, as explained in the documentation. The graphs can then be mapped to 3D-grids as well. The generated data can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either [CNNs](https://en.wikipedia.org/wiki/Convolutional_neural_network) (for 3D-grids) or [GNNs](https://en.wikipedia.org/wiki/Graph_neural_network) (for graphs), as well as output exporters for evaluating performances.

 Main features:

@@ -28,7 +33,7 @@ Main features:
   - binary class, CAPRI categories, DockQ, RMSD, and FNAT
   - Detailed docking scores documentation is available [here](https://deeprank2.readthedocs.io/en/latest/docking.html)
 - Flexible definition of both new features and targets
-- Features generation for both graphs and grids
+- Features generation for both graphs and 3D-grids
 - Efficient data storage in HDF5 format
 - Support for both classification and regression (based on [PyTorch](https://pytorch.org/) and [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/))

@@ -172,7 +177,7 @@ A `Query` takes as inputs:
 - `chain_ids`, the chain ID or IDs (generally single capital letter(s)). 
- `SingleResidueVariantQuery` takes a single ID, which represents the chain containing the variant residue.
   - `ProteinProteinInterfaceQuery` takes a pair of IDs, which represent the chains between which the interface exists.
-  - Note that in either case this does not limit the structure to residues from this chain (or these chains). The structure contained in the `.pdb` can thus have any number of chains, and residues from these chains will be included in the graphs and grids produced by DeepRank2 (if they are within the `influence_radius`).
+  - Note that in either case this does not limit the structure to residues from this chain (or these chains). The structure contained in the `.pdb` can thus have any number of chains, and residues from these chains will be included in the graphs and 3D-grids produced by DeepRank2 (if they are within the `influence_radius`).
 - Optionally, the corresponding position-specific scoring matrices (PSSMs), in the form of `.pssm` files.

 ```python
 from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery
@@ -222,7 +227,7 @@ queries.add(ProteinProteinInterfaceQuery(

 The user is free to implement a custom query class. Each implementation requires the `build` method to be present.

-The queries can then be processed into graphs only or both graphs and 3D grids, depending on which kind of network will be used later for training.
+The queries can then be processed into graphs only or both graphs and 3D-grids, depending on which kind of network will be used later for training.

 ```python
 from deeprank2.features import components, conservation, contact, exposure, irc, surfacearea
@@ -235,7 +240,7 @@ hdf5_paths = queries.process(
     "/<output_folder>/<prefix_for_outputs>",
     feature_modules = feature_modules)

-# Save data into 3D-graphs and 3D-grids
+# Save data into graphs and 3D-grids
 hdf5_paths = queries.process(
     "/<output_folder>/<prefix_for_outputs>",
     feature_modules = feature_modules,

From bf7e76294422eafdac0bf5b445a24c578ced3843 Mon Sep 17 00:00:00 2001
From: gcroci2 <g.crocioni@esciencecenter.nl>
Date: Tue, 5 Mar 2024 11:18:24 +0100
Subject: [PATCH 48/61] update deeprank2 description for readthedocs

---
 docs/index.rst | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index fe3d4084d..29a7cb0c5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -4,14 +4,19 @@ DeepRank2 |version| documentation

 DeepRank2 is an open-source deep learning (DL) framework for data mining of protein-protein interfaces (PPIs) or single-residue variants (SRVs). This package is an improved and unified version of three previously developed packages: `DeepRank`_, `DeepRank-GNN`_, and `DeepRank-Mut`_.

-DeepRank2 allows for transformation of (pdb formatted) molecular data into 3D representations (either grids or graphs) containing structural and physico-chemical information, which can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either `convolutional neural networks`_ (for grids) or `graph neural networks`_ (for graphs), as well as output exporters for evaluating performances.
+As input, DeepRank2 takes `PDB-formatted`_ atomic structures, and maps them to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. The user can configure two types of 3D structures as input for the featurization phase:
+
+- PPIs, for mining interaction patterns within protein-protein complexes, implemented by the `ProteinProteinInterfaceQuery` class;
+- SRVs, for mining mutation phenotypes within protein structures, implemented by the `SingleResidueVariantQuery` class. 
+
+The physico-chemical and geometrical features are then computed and assigned to each node and edge. The user can choose which features to generate from several pre-existing options defined in the package, or define custom feature modules, as explained in the documentation. The graphs can then be mapped to 3D-grids as well. The generated data can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either `convolutional neural networks`_ (for 3D-grids) or `graph neural networks`_ (for graphs), as well as output exporters for evaluating performances.

 Main features:

 * Predefined atom-level and residue-level feature types (e.g. atom/residue type, charge, size, potential energy, all features' documentation is available under `Features`_ notes)
 * Predefined target types (binary class, CAPRI categories, DockQ, RMSD, and FNAT, detailed docking scores documentation is available under `Docking scores`_ notes)
 * Flexible definition of both new features and targets
-* Features generation for both graphs and grids
+* Features generation for both graphs and 3D-grids
 * Efficient data storage in HDF5 format
 * Support both classification and regression (based on `PyTorch`_ and `PyTorch Geometric`_)

@@ -24,6 +29,7 @@ Main features:
 .. _Docking scores: https://deeprank2.readthedocs.io/en/latest/docking.html
 .. _PyTorch: https://pytorch.org/docs/stable/index.html
 .. _PyTorch Geometric: https://pytorch-geometric.readthedocs.io/en/latest/
+.. _PDB-formatted: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html

 Getting started
 ===========

From 277d1b4f591b8dd2f433a93061527e502edbeb36 Mon Sep 17 00:00:00 2001
From: gcroci2 <g.crocioni@esciencecenter.nl>
Date: Thu, 7 Mar 2024 11:10:42 +0100
Subject: [PATCH 49/61] add suggestion

---
 README.md      | 2 +-
 docs/index.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 310e93fe6..4482c0d70 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
 DeepRank2 is an open-source deep learning (DL) framework for data mining of protein-protein interfaces (PPIs) or single-residue variants (SRVs). This package is an improved and unified version of three previously developed packages: [DeepRank](https://github.com/DeepRank/deeprank), [DeepRank-GNN](https://github.com/DeepRank/Deeprank-GNN), and [DeepRank-Mut](https://github.com/DeepRank/DeepRank-Mut).

-As input, DeepRank2 takes [PDB-formatted](https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html) atomic structures, and maps them to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. The user can configure two types of 3D structures as input for the featurization phase:
+As input, DeepRank2 takes [PDB-formatted](https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html) atomic structures, and maps them to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. DeepRank2 has the option to choose between two types of queries as input for the featurization phase: 
diff --git a/docs/index.rst b/docs/index.rst index 29a7cb0c5..689aefe4f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,7 +4,7 @@ DeepRank2 |version| documentation DeepRank2 is an open-source deep learning (DL) framework for data mining of protein-protein interfaces (PPIs) or single-residue variants (SRVs). This package is an improved and unified version of three previously developed packages: `DeepRank`_, `DeepRank-GNN`_, and `DeepRank-Mut`_. -As input, DeepRank2 takes `PDB-formatted`_ atomic structures, and map them to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. The user can configure two types of 3D structures as input for the featurization phase: +As input, DeepRank2 takes `PDB-formatted`_ atomic structures, and map them to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. DeepRank2 has the option to choose between two types of queries as input for the featurization phase: - PPIs, for mining interaction patterns within protein-protein complexes, implemented by the `ProteinProteinInterfaceQuery` class; - SRVs, for mining mutation phenotypes within protein structures, implemented by the `SingleResidueVariantQuery` class. From 8cda6679386817823ad1fab5e1b88d35d923c087 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 18 Mar 2024 11:29:28 +0100 Subject: [PATCH 50/61] add opls info to the warning --- deeprank2/utils/parsing/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deeprank2/utils/parsing/__init__.py b/deeprank2/utils/parsing/__init__.py index 1d253eee0..9f61c4dd3 100644 --- a/deeprank2/utils/parsing/__init__.py +++ b/deeprank2/utils/parsing/__init__.py @@ -65,7 +65,7 @@ def get_vanderwaals_parameters(self, atom: Atom) -> VanderwaalsParam: type_ = action["TYPE"] if type_ is None: - _log.warning(f"Atom {atom} is unknown to the forcefield; vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0)") + _log.warning(f"Atom {atom} is unknown to OPLS, the forcefield used in deeprank2; vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0)") return VanderwaalsParam(0.0, 0.0, 0.0, 0.0) return self._vanderwaals_parameters[type_] @@ -94,7 +94,7 @@ def get_charge(self, atom: Atom) -> float: charge = float(action["CHARGE"]) if charge is None: - _log.warning(f"Atom {atom} is unknown to the forcefield; charge is set to 0.0") + _log.warning(f"Atom {atom} is unknown to OPLS, the forcefield used in deeprank2; charge is set to 0.0") return 0.0 return charge From 369f2ccea1794766ea7a613c48f9145f79ccc15a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 18 Mar 2024 11:38:13 +0100 Subject: [PATCH 51/61] add ref to opls in the docs --- docs/features.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/features.md b/docs/features.md index 3ed576ccd..1a5c272ac 100644 --- a/docs/features.md +++ b/docs/features.md @@ -168,9 +168,12 @@ These features relate to the structural relationship between nodes. #### Nonbond energies: -These features measure nonbond energy potentials between nodes. +These features measure nonbond energy potentials between nodes, and are calculated using [OPLS forcefield](https://en.wikipedia.org/wiki/OPLS). For residue graphs, the pairwise sum of potentials for all atoms from each residue is used. Note that no distance cutoff is used and the radius of influence is assumed to be infinite, although the potentials tends to 0 at large distance. 
Also edges are only assigned within a given cutoff radius when graphs are created. + Nonbond energies are set to 0 for any atom pairs (on the same chain) that are within a cutoff radius of 3.6 Å, as these are assumed to be covalent neighbors or linked by no more than 2 covalent bonds (i.e. 1-3 pairs). +Charge or vanderwaals parameters are set to 0 for those atoms that are unknown to the OPLS forcefield. + - `electrostatic`: Electrostatic potential (also known as Coulomb potential) between two nodes, calculated using interatomic distances and charges of each atom (float). - `vanderwaals`: Van der Waals potential (also known as Lennard-Jones potential) between two nodes, calculated using interatomic distance/s and a list of atoms with vanderwaals parameters (`deeprank2.domain.forcefield.protein-allhdg5-4_new`, float). Atom pairs within a cutoff radius of 4.2 Å (but above 3.6 Å) are assumed to be separated by separated by exactly 2 covalent bonds (i.e. 1-4 pairs) and use a set of lower energy parameters. From 158cf8714f75ad589e53a7be54222d8899e7ef90 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 18 Mar 2024 14:46:44 +0100 Subject: [PATCH 52/61] add link to the docs in the warnings --- deeprank2/utils/parsing/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/deeprank2/utils/parsing/__init__.py b/deeprank2/utils/parsing/__init__.py index 9f61c4dd3..f81b33f86 100644 --- a/deeprank2/utils/parsing/__init__.py +++ b/deeprank2/utils/parsing/__init__.py @@ -65,7 +65,10 @@ def get_vanderwaals_parameters(self, atom: Atom) -> VanderwaalsParam: type_ = action["TYPE"] if type_ is None: - _log.warning(f"Atom {atom} is unknown to OPLS, the forcefield used in deeprank2; vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0)") + _log.warning( + f"Atom {atom} is unknown to the forcefield, vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0).\n \ + Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.", + ) return VanderwaalsParam(0.0, 0.0, 0.0, 0.0) return self._vanderwaals_parameters[type_] @@ -94,7 +97,10 @@ def get_charge(self, atom: Atom) -> float: charge = float(action["CHARGE"]) if charge is None: - _log.warning(f"Atom {atom} is unknown to OPLS, the forcefield used in deeprank2; charge is set to 0.0") + _log.warning( + f"Atom {atom} is unknown to the forcefield, charge is set to 0.0. \n \ + Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.", + ) return 0.0 return charge From 13b9f0a63ae2a52a8c48eee35f414682c3d9dbcb Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 18 Mar 2024 15:11:46 +0100 Subject: [PATCH 53/61] add info about missing values --- docs/features.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/features.md b/docs/features.md index 1a5c272ac..fa475a6c0 100644 --- a/docs/features.md +++ b/docs/features.md @@ -173,7 +173,9 @@ For residue graphs, the pairwise sum of potentials for all atoms from each resid Nonbond energies are set to 0 for any atom pairs (on the same chain) that are within a cutoff radius of 3.6 Å, as these are assumed to be covalent neighbors or linked by no more than 2 covalent bonds (i.e. 1-3 pairs). -Charge or vanderwaals parameters are set to 0 for those atoms that are unknown to the OPLS forcefield. - - `electrostatic`: Electrostatic potential (also known as Coulomb potential) between two nodes, calculated using interatomic distances and charges of each atom (float). 
From 1fc6f6310084991cba37add17cbc14ca76523366 Mon Sep 17 00:00:00 2001
From: gcroci2
Date: Mon, 18 Mar 2024 15:14:01 +0100
Subject: [PATCH 54/61] specify features affected

---
 deeprank2/utils/parsing/__init__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/deeprank2/utils/parsing/__init__.py b/deeprank2/utils/parsing/__init__.py
index f81b33f86..00bed3ddc 100644
--- a/deeprank2/utils/parsing/__init__.py
+++ b/deeprank2/utils/parsing/__init__.py
@@ -67,7 +67,8 @@ def get_vanderwaals_parameters(self, atom: Atom) -> VanderwaalsParam:
     if type_ is None:
         _log.warning(
             f"Atom {atom} is unknown to the forcefield, vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0).\n \
-            Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.",
+            This will affect the `vanderwaals` feature.\n \
+            Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.",
         )
         return VanderwaalsParam(0.0, 0.0, 0.0, 0.0)
     return self._vanderwaals_parameters[type_]
@@ -99,7 +100,8 @@ def get_charge(self, atom: Atom) -> float:
         charge = float(action["CHARGE"])
 
     if charge is None:
         _log.warning(
             f"Atom {atom} is unknown to the forcefield, charge is set to 0.0. \n \
-            Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.",
+            This will affect the `electrostatic` and `atom_charge` features.\n \
+            Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.",
         )
         return 0.0
     return charge
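When these warnings fire for a large share of the atoms in a dataset, the documentation added in the previous patches suggests dropping the affected features. A hedged sketch of what that could look like at training time: the `GraphDataset` keyword names and the exact feature lists should be checked against the deeprank2 API docs, and the HDF5 path is hypothetical.

```python
from deeprank2.dataset import GraphDataset

# Select features explicitly instead of the default "all", leaving out the
# three that depend on forcefield parameters: atom_charge, electrostatic,
# and vanderwaals.
dataset = GraphDataset(
    hdf5_path="processed_data.hdf5",  # hypothetical file name
    node_features=["res_type", "polarity", "bsa"],
    edge_features=["distance", "same_chain", "covalent"],
    target="binary",
)
```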
From f6f319b11933d13b9eb13693e44506f2299d9fab Mon Sep 17 00:00:00 2001
From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com>
Date: Tue, 19 Mar 2024 09:50:45 +0100
Subject: [PATCH 55/61] Update deeprank2/utils/parsing/__init__.py

Co-authored-by: Dani Bodor
---
 deeprank2/utils/parsing/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deeprank2/utils/parsing/__init__.py b/deeprank2/utils/parsing/__init__.py
index 00bed3ddc..466e470b1 100644
--- a/deeprank2/utils/parsing/__init__.py
+++ b/deeprank2/utils/parsing/__init__.py
@@ -66,9 +66,9 @@ def get_vanderwaals_parameters(self, atom: Atom) -> VanderwaalsParam:
 
     if type_ is None:
         _log.warning(
-            f"Atom {atom} is unknown to the forcefield, vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0).\n \
-            This will affect the `vanderwaals` feature.\n \
-            Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.",
+            f"Atom {atom} is unknown to the forcefield, vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0).\n"
+            " This will affect the `vanderwaals` feature.\n"
+            " Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.",
         )
         return VanderwaalsParam(0.0, 0.0, 0.0, 0.0)
     return self._vanderwaals_parameters[type_]

From c602dd54aeaea2df0f2dff6d3e4ca177a8ea387c Mon Sep 17 00:00:00 2001
From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com>
Date: Tue, 19 Mar 2024 09:50:50 +0100
Subject: [PATCH 56/61] Update deeprank2/utils/parsing/__init__.py

Co-authored-by: Dani Bodor
---
 deeprank2/utils/parsing/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deeprank2/utils/parsing/__init__.py b/deeprank2/utils/parsing/__init__.py
index 466e470b1..f22f4e52a 100644
--- a/deeprank2/utils/parsing/__init__.py
+++ b/deeprank2/utils/parsing/__init__.py
@@ -99,9 +99,9 @@ def get_charge(self, atom: Atom) -> float:
 
     if charge is None:
         _log.warning(
-            f"Atom {atom} is unknown to the forcefield, charge is set to 0.0. \n \
-            This will affect the `electrostatic` and `atom_charge` features.\n \
-            Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.",
+            f"Atom {atom} is unknown to the forcefield, charge is set to 0.0.\n"
+            " This will affect the `electrostatic` and `atom_charge` features.\n"
+            " Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.",
         )
         return 0.0
     return charge

From 53afaf02ea8b9caa83fc6565d0a314d169576cba Mon Sep 17 00:00:00 2001
From: gcroci2
Date: Tue, 19 Mar 2024 09:51:53 +0100
Subject: [PATCH 57/61] add link to the tutorials

---
 docs/features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features.md b/docs/features.md
index fa475a6c0..6fa59ea6b 100644
--- a/docs/features.md
+++ b/docs/features.md
@@ -178,4 +178,4 @@
 Charge and vanderwaals parameters are set to 0 for those atoms that are unknown to the OPLS forcefield, treating such cases as missing values. If this happens for many of the atoms in the PDB file/s provided, depending on the specific dataset it may be worth it to drop the features affected, i.e., `electrostatic`, `vanderwaals`, and `atom_charge`.
 
-- It may be useful to generate histograms of the processed data to further investigate the distribution of these features' values before deciding whether to drop them. Refer to the `data_generation_xxx.ipynb` tutorial files for comprehensive instructions on transforming the data into a Pandas dataframe and generating histograms of the features.
+- It may be useful to generate histograms of the processed data to further investigate the distribution of these features' values before deciding whether to drop them. Refer to the `data_generation_xxx.ipynb` [tutorial files](https://github.com/DeepRank/deeprank2/tree/main/tutorials) for comprehensive instructions on transforming the data into a Pandas dataframe and generating histograms of the features.
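For readers without the tutorials at hand, a minimal sketch of the histogram step referenced above. The HDF5 layout assumed here (one group per processed entry, with an "edge_features" subgroup) should be verified against your own processed files, and the file name is hypothetical:

```python
import h5py
import matplotlib.pyplot as plt
import pandas as pd

values = []
with h5py.File("processed_data.hdf5", "r") as f5:
    for entry in f5:  # one group per processed query
        values.extend(f5[entry]["edge_features"]["electrostatic"][()])

# A single-column version of the dataframe the tutorials build for all features.
df = pd.DataFrame({"electrostatic": values})
df["electrostatic"].hist(bins=100)
plt.xlabel("electrostatic")
plt.ylabel("count")
plt.savefig("electrostatic_hist.png")
```

A histogram strongly peaked at 0 would indicate that many pairs fell back to the neutral defaults, supporting the decision to drop the feature.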
From 1d1cca5933167681e0589e92a1f8e4175238a20b Mon Sep 17 00:00:00 2001
From: gcroci2
Date: Tue, 19 Mar 2024 10:07:06 +0100
Subject: [PATCH 58/61] add version and format citation.cff

---
 CITATION.cff | 81 ++++++++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/CITATION.cff b/CITATION.cff
index 0fc0a42aa..b3abe5e6b 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,45 +1,5 @@
 cff-version: "1.2.0"
 authors:
-- family-names: Crocioni
-  given-names: Giulia
-  orcid: "https://orcid.org/0000-0002-0823-0121"
-- family-names: Bodor
-  given-names: Dani L.
-  orcid: "https://orcid.org/0000-0003-2109-2349"
-- family-names: Baakman
-  given-names: Coos
-  orcid: "https://orcid.org/0000-0003-4317-1566"
-- family-names: Parizi
-  given-names: Farzaneh M.
-  orcid: "https://orcid.org/0000-0003-4230-7492"
-- family-names: Rademaker
-  given-names: Daniel-T.
-  orcid: "https://orcid.org/0000-0003-1959-1317"
-- family-names: Ramakrishnan
-  given-names: Gayatri
-  orcid: "https://orcid.org/0000-0001-8203-2783"
-- family-names: Burg
-  given-names: Sven A.
-  name-particle: van der
-  orcid: "https://orcid.org/0000-0003-1250-6968"
-- family-names: Marzella
-  given-names: Dario F.
-  orcid: "https://orcid.org/0000-0002-0043-3055"
-- family-names: Teixeira
-  given-names: João M. C.
-  orcid: "https://orcid.org/0000-0002-9113-0622"
-- family-names: Xue
-  given-names: Li C.
-  orcid: "https://orcid.org/0000-0002-2613-538X"
-contact:
-- family-names: Crocioni
-  given-names: Giulia
-  orcid: "https://orcid.org/0000-0002-0823-0121"
-doi: 10.5281/zenodo.10566809
-message: If you use this software, please cite our article in the
-  Journal of Open Source Software.
-preferred-citation:
-  authors:
   - family-names: Crocioni
     given-names: Giulia
     orcid: "https://orcid.org/0000-0002-0823-0121"
   - family-names: Bodor
     given-names: Dani L.
     orcid: "https://orcid.org/0000-0003-2109-2349"
   - family-names: Baakman
     given-names: Coos
     orcid: "https://orcid.org/0000-0003-4317-1566"
   - family-names: Parizi
     given-names: Farzaneh M.
     orcid: "https://orcid.org/0000-0003-4230-7492"
   - family-names: Rademaker
     given-names: Daniel-T.
     orcid: "https://orcid.org/0000-0003-1959-1317"
   - family-names: Ramakrishnan
     given-names: Gayatri
     orcid: "https://orcid.org/0000-0001-8203-2783"
   - family-names: Burg
     given-names: Sven A.
     name-particle: van der
     orcid: "https://orcid.org/0000-0003-1250-6968"
   - family-names: Marzella
     given-names: Dario F.
     orcid: "https://orcid.org/0000-0002-0043-3055"
   - family-names: Teixeira
     given-names: João M. C.
     orcid: "https://orcid.org/0000-0002-9113-0622"
   - family-names: Xue
     given-names: Li C.
     orcid: "https://orcid.org/0000-0002-2613-538X"
+contact:
+  - family-names: Crocioni
+    given-names: Giulia
+    orcid: "https://orcid.org/0000-0002-0823-0121"
+doi: 10.5281/zenodo.10566809
+message: If you use this software, please cite our article in the
+  Journal of Open Source Software.
+preferred-citation:
+  authors:
+    - family-names: Crocioni
+      given-names: Giulia
+      orcid: "https://orcid.org/0000-0002-0823-0121"
+    - family-names: Bodor
+      given-names: Dani L.
+      orcid: "https://orcid.org/0000-0003-2109-2349"
+    - family-names: Baakman
+      given-names: Coos
+      orcid: "https://orcid.org/0000-0003-4317-1566"
+    - family-names: Parizi
+      given-names: Farzaneh M.
+ orcid: "https://orcid.org/0000-0003-4230-7492" + - family-names: Rademaker + given-names: Daniel-T. + orcid: "https://orcid.org/0000-0003-1959-1317" + - family-names: Ramakrishnan + given-names: Gayatri + orcid: "https://orcid.org/0000-0001-8203-2783" + - family-names: Burg + given-names: Sven A. + name-particle: van der + orcid: "https://orcid.org/0000-0003-1250-6968" + - family-names: Marzella + given-names: Dario F. + orcid: "https://orcid.org/0000-0002-0043-3055" + - family-names: Teixeira + given-names: João M. C. + orcid: "https://orcid.org/0000-0002-9113-0622" + - family-names: Xue + given-names: Li C. + orcid: "https://orcid.org/0000-0002-2613-538X" date-published: 2024-02-27 doi: 10.21105/joss.05983 issn: 2475-9066 @@ -86,3 +86,4 @@ preferred-citation: volume: 9 title: "DeepRank2: Mining 3D Protein Structures with Geometric Deep Learning" +version: 3.0.1 From 05b91be8e333c10af225b3be54e42fd4632083e3 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 19 Mar 2024 10:08:04 +0100 Subject: [PATCH 59/61] add quotation marks to the version --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index b3abe5e6b..f5cc09a32 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -86,4 +86,4 @@ preferred-citation: volume: 9 title: "DeepRank2: Mining 3D Protein Structures with Geometric Deep Learning" -version: 3.0.1 +version: "3.0.1" From 689643dac3a87339e310680d9efc3a223288cb72 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 19 Mar 2024 10:09:10 +0100 Subject: [PATCH 60/61] bump patch version --- .bumpversion.cfg | 2 +- CITATION.cff | 2 +- deeprank2/__init__.py | 2 +- pyproject.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 26f291b47..94fe0bfc3 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.1 +current_version = 3.0.2 [comment] comment = The contents of this file cannot be merged with that of setup.cfg until https://github.com/c4urself/bump2version/issues/185 is resolved diff --git a/CITATION.cff b/CITATION.cff index f5cc09a32..981a89d16 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -86,4 +86,4 @@ preferred-citation: volume: 9 title: "DeepRank2: Mining 3D Protein Structures with Geometric Deep Learning" -version: "3.0.1" +version: "3.0.2" diff --git a/deeprank2/__init__.py b/deeprank2/__init__.py index 055276878..131942e76 100644 --- a/deeprank2/__init__.py +++ b/deeprank2/__init__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.0.2" diff --git a/pyproject.toml b/pyproject.toml index 7978fdbfa..e2e666750 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "deeprank2" -version = "3.0.1" +version = "3.0.2" description = "DeepRank2 is an open-source deep learning framework for data mining of protein-protein interfaces or single-residue missense variants." 
readme = "README.md" requires-python = ">=3.10" From 54dc82897f59b190d4b2b7215fafebb1b3c5e891 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 19 Mar 2024 10:17:33 +0100 Subject: [PATCH 61/61] remove tests folder from the distribution --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e2e666750..5ecc23d9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ source = ["deeprank2"] [tool.setuptools.packages.find] include = ["deeprank2*"] -exclude = ["tests*", "*tests.*", "*tests"] +exclude = ["tests", "tests*", "*tests.*", "*tests"] [tool.setuptools.package-data] "*" = ["*.xlsx", "*.param", "*.top", "*residue-classes"]