Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add edge randomization ablation configuration #3

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions config/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Conda environment for ESMFold structure generation.
# NOTE: channel order matters — conda-forge is searched before bioconda/pytorch.
name: esmfold
channels:
  - conda-forge
  - bioconda
  - pytorch
dependencies:
  # Core toolchain (python pinned to 3.7 for openmm 7.5.1 compatibility).
  # NOTE(review): conda specs mix `=` (prefix match) and `==` (exact match)
  # below — this mirrors the upstream environment file; confirm before unifying.
  - conda-forge::python=3.7
  - conda-forge::setuptools=59.5.0
  - conda-forge::pip
  - conda-forge::openmm=7.5.1
  - conda-forge::pdbfixer
  - conda-forge::cudatoolkit==11.3.*
  - conda-forge::einops
  - conda-forge::fairscale
  - conda-forge::omegaconf
  - conda-forge::hydra-core
  - conda-forge::pandas
  - conda-forge::pytest
  # Sequence-search tools used for MSA/template generation.
  - bioconda::hmmer==3.3.2
  - bioconda::hhsuite==3.3.0
  - bioconda::kalign2==2.04
  - pytorch::pytorch=1.12.*
  # Packages not available (or not pinned) on conda channels.
  # The pip entries MUST be nested under `- pip:` — at the same level they
  # would be parsed as (nonexistent) conda packages.
  - pip:
      - biopython==1.79
      - deepspeed==0.5.9
      - dm-tree==0.1.6
      - ml-collections==0.1.0
      - numpy==1.21.2
      - PyYAML==5.4.1
      - requests==2.26.0
      - scipy==1.7.1
      - tqdm==4.62.2
      - typing-extensions==3.10.0.2
      - pytorch_lightning==1.5.10
      - wandb==0.12.21
      - git+https://github.com/NVIDIA/dllogger.git
62 changes: 62 additions & 0 deletions config/esmfold_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Hydra config for GearNet-style function-prediction evaluation.
# `_self_` is listed first, so keys from the `dataset` group override
# values set directly in this file.
defaults:
  - _self_
  - dataset: gearnet_ec

# torchdrug-style task specification (class names resolved by the consumer).
task:
  class: MultipleBinaryClassification
  model:
    output_dim: 0  # presumably overridden at runtime from the dataset — TODO confirm
  graph_construction_model:
    class: GraphConstruction
    node_layers:
      - class: AlphaCarbonNode
    edge_layers:
      - class: SequentialEdge
  criterion: bce
  num_mlp_layer: 0
  metric: ['auprc@micro', 'f1_max']

# SLURM/submitit resource requests for this job.
compute:
  array_parallelism: 10
  cpus_per_task: 5
  mem_per_cpu: 2g
  timeout_min: 1440
  job_name: esmfold
  partition: p.hpcl94g
  gpus_per_node: 1
  gpus_per_task: 1
  tasks_per_node: 1

data:
  batch_size: 20
  fold_batch_size: 10
  # null disables the edge-randomization ablation; added for consistency with
  # pst_gearnet.yaml / pst_gearnet_esmfold.yaml in this change set.
  # NOTE(review): confirm the consuming script reads data.edge_perturb here.
  edge_perturb: null

seed: 1234
model: pst_t6
use_edge_attr: false
datapath: datasets
nogpu: false
metric: 'f1_max'  # selection metric (distinct from task.metric above)
batch_size: 4     # embedding-extraction batch size (distinct from data.batch_size)
num_workers: 4
device: null      # null → auto-select

truncation_seq_length: 5000
toks_per_batch: 4096
include_seq: false
aggr: concat
use_pca: null
parallel: false
pretrained: .cache/pst

logs:
  prefix: logs_pst/gearnet_data
  path: ${logs.prefix}/${dataset.name}/${model}/${aggr}/${include_seq}/${seed}

# output directory, generated dynamically on each run
hydra:
  run:
    dir: ${logs.path}
  sweep:
    dir: ${logs.prefix}/${dataset.name}/${model}/${aggr}/${include_seq}/${seed}
    subdir: ""
27 changes: 27 additions & 0 deletions config/hydra/launcher/slurm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# @package hydra.launcher
# Submitit SLURM launcher configuration for Hydra multirun.
_target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher

# SLURM queue parameters
# partition: p.hpcl8
partition: p.hpcl91

# Job resource requirements
# NOTE: was `14_400` — the underscore digit separator is YAML 1.1 only;
# a YAML 1.2 parser would read it as the string "14_400".
timeout_min: 14400
cpus_per_task: 12
gpus_per_node: 1
gpus_per_task: 1
tasks_per_node: 1
mem_per_cpu: 2g
nodes: 1
exclude: hpcl9101

# Job naming and output (%j expands to the SLURM job id)
name: "pst"
submitit_folder: ./logs/submitit/%j

# Additional settings
signal_delay_s: 5
max_num_timeout: 0
additional_parameters: {}
array_parallelism: 30
55 changes: 55 additions & 0 deletions config/pst_edge_perturb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Hydra config for the edge-randomization ablation pretraining run.
# `_self_` first → the training/base_model/mode groups override keys below.
defaults:
  - _self_
  - training: default
  - base_model: esm2_t6
  - mode: default

debug: false
seed: 1234

data:
  organism: swissprot
  datapath: datasets/AlphaFold/${data.organism}
  graph_eps: 8.0     # distance cutoff (Å, presumably) for graph edges — TODO confirm units
  crop_len: 1024
  mask_rate: 0.15
  val_datapath: datasets/dms
  # Ablation switch added by this change: edges are randomized when set.
  edge_perturb: random

# PyTorch Lightning trainer settings.
compute:
  accelerator: gpu
  precision: 16-mixed
  strategy: ddp
  num_workers: 8
  n_jobs: 10
  devices: auto

logs:
  prefix: logs_pst/random_ablation/edge_perturb_${data.edge_perturb}
  path: ${logs.prefix}/${model.name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
  wandb:
    enable: true
    name: ${model.name}_edge_perturb_${data.edge_perturb}
    tags:
      - random_ablation
      - organism_${data.organism}
      - model_${model.name}
      - edge_perturb_${data.edge_perturb}
    entity: "BorgwardtLab"
    project: "PST"
    save_dir: ${logs.path}

model:
  k_hop: 2
  train_struct_only: true
  use_edge_attr: false
  gnn_type: gin
  edge_dim: null

# output directory, generated dynamically on each run
hydra:
  run:
    dir: ${logs.path}
  sweep:
    dir: ${logs.prefix}/${model.name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
    subdir: ${hydra.job.num}
3 changes: 3 additions & 0 deletions config/pst_gearnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ task:
num_mlp_layer: 0
metric: ['auprc@micro', 'f1_max']

data:
edge_perturb: null

seed: 1234
model: pst_t6
use_edge_attr: false
Expand Down
51 changes: 51 additions & 0 deletions config/pst_gearnet_esmfold.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Hydra config for GearNet-style evaluation on ESMFold-predicted structures.
# `_self_` first → keys from the `dataset` group override values below.
defaults:
  - _self_
  - dataset: gearnet_ec

# torchdrug-style task specification (class names resolved by the consumer).
task:
  class: MultipleBinaryClassification
  model:
    output_dim: 0  # presumably overridden at runtime from the dataset — TODO confirm
  graph_construction_model:
    class: GraphConstruction
    node_layers:
      - class: AlphaCarbonNode
    edge_layers:
      - class: SequentialEdge
  criterion: bce
  num_mlp_layer: 0
  metric: ['auprc@micro', 'f1_max']

data:
  # null disables the edge-randomization ablation added by this change.
  edge_perturb: null
  esmfold_structures_path: datasets/esmfold/structures/

seed: 1234
model: pst_t6
use_edge_attr: false
datapath: datasets
nogpu: false
metric: 'f1_max'  # selection metric (distinct from task.metric above)
batch_size: 4
num_workers: 4
device: null      # null → auto-select

truncation_seq_length: 5000
toks_per_batch: 4096
include_seq: false
aggr: concat
use_pca: null

pretrained: .cache/pst

logs:
  prefix: logs_pst/gearnet_data
  path: ${logs.prefix}/${dataset.name}/${model}/${aggr}/${include_seq}/${seed}

# output directory, generated dynamically on each run
hydra:
  run:
    dir: ${logs.path}
  sweep:
    dir: ${logs.prefix}/${dataset.name}/${model}/${aggr}/${include_seq}/${seed}
    subdir: ""
1 change: 1 addition & 0 deletions config/pst_gearnet_finetune.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ compute:
strategy: auto
num_workers: 8
n_jobs: 10
devices: auto

pretrained: .cache/pst

Expand Down
1 change: 1 addition & 0 deletions config/pst_pretrain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ compute:
strategy: ddp
num_workers: 8
n_jobs: 10
devices: auto

logs:
prefix: logs_pst/pretrain
Expand Down
6 changes: 3 additions & 3 deletions config/pst_proteinshake.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ split: structure
batch_size: 4
num_workers: 4
device: null

truncation_seq_length: 5000
toks_per_batch: 4096
include_seq: false

pretrained: .cache/pst
perturbation: null
model_path: null

logs:
prefix: logs_pst/proteinshake
path: ${logs.prefix}/${task.name}/${model}/${split}/${seed}
path: ${logs.prefix}/${task.name}/${model}/${split}/${seed}/${perturbation}

# output directory, generated dynamically on each run
hydra:
Expand Down
Loading