Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add edge randomization ablation configuration #3

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions config/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Conda environment for ESMFold structure generation.
# NOTE: channel order matters — conda-forge is searched before bioconda/pytorch.
name: esmfold
channels:
  - conda-forge
  - bioconda
  - pytorch
dependencies:
  # Core toolchain (python pinned to 3.7 for openmm 7.5.1 compatibility).
  # NOTE(review): conda specs mix `=` (prefix match) and `==` (exact match)
  # below — this mirrors the upstream environment file; confirm before unifying.
  - conda-forge::python=3.7
  - conda-forge::setuptools=59.5.0
  - conda-forge::pip
  - conda-forge::openmm=7.5.1
  - conda-forge::pdbfixer
  - conda-forge::cudatoolkit==11.3.*
  - conda-forge::einops
  - conda-forge::fairscale
  - conda-forge::omegaconf
  - conda-forge::hydra-core
  - conda-forge::pandas
  - conda-forge::pytest
  # Sequence-search tools used for MSA/template generation.
  - bioconda::hmmer==3.3.2
  - bioconda::hhsuite==3.3.0
  - bioconda::kalign2==2.04
  - pytorch::pytorch=1.12.*
  # Packages not available (or not pinned) on conda channels.
  # The pip entries MUST be nested under `- pip:` — at the same level they
  # would be parsed as (nonexistent) conda packages.
  - pip:
      - biopython==1.79
      - deepspeed==0.5.9
      - dm-tree==0.1.6
      - ml-collections==0.1.0
      - numpy==1.21.2
      - PyYAML==5.4.1
      - requests==2.26.0
      - scipy==1.7.1
      - tqdm==4.62.2
      - typing-extensions==3.10.0.2
      - pytorch_lightning==1.5.10
      - wandb==0.12.21
      - git+https://github.com/NVIDIA/dllogger.git
62 changes: 62 additions & 0 deletions config/esmfold_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Hydra config for GearNet-style function-prediction evaluation.
# `_self_` is listed first, so keys from the `dataset` group override
# values set directly in this file.
defaults:
  - _self_
  - dataset: gearnet_ec

# torchdrug-style task specification (class names resolved by the consumer).
task:
  class: MultipleBinaryClassification
  model:
    output_dim: 0  # presumably overridden at runtime from the dataset — TODO confirm
  graph_construction_model:
    class: GraphConstruction
    node_layers:
      - class: AlphaCarbonNode
    edge_layers:
      - class: SequentialEdge
  criterion: bce
  num_mlp_layer: 0
  metric: ['auprc@micro', 'f1_max']

# SLURM/submitit resource requests for this job.
compute:
  array_parallelism: 10
  cpus_per_task: 5
  mem_per_cpu: 2g
  timeout_min: 1440
  job_name: esmfold
  partition: p.hpcl94g
  gpus_per_node: 1
  gpus_per_task: 1
  tasks_per_node: 1

data:
  batch_size: 20
  fold_batch_size: 10
  # null disables the edge-randomization ablation; added for consistency with
  # pst_gearnet.yaml / pst_gearnet_esmfold.yaml in this change set.
  # NOTE(review): confirm the consuming script reads data.edge_perturb here.
  edge_perturb: null

seed: 1234
model: pst_t6
use_edge_attr: false
datapath: datasets
nogpu: false
metric: 'f1_max'  # selection metric (distinct from task.metric above)
batch_size: 4     # embedding-extraction batch size (distinct from data.batch_size)
num_workers: 4
device: null      # null → auto-select

truncation_seq_length: 5000
toks_per_batch: 4096
include_seq: false
aggr: concat
use_pca: null
parallel: false
pretrained: .cache/pst

logs:
  prefix: logs_pst/gearnet_data
  path: ${logs.prefix}/${dataset.name}/${model}/${aggr}/${include_seq}/${seed}

# output directory, generated dynamically on each run
hydra:
  run:
    dir: ${logs.path}
  sweep:
    dir: ${logs.prefix}/${dataset.name}/${model}/${aggr}/${include_seq}/${seed}
    subdir: ""
27 changes: 27 additions & 0 deletions config/hydra/launcher/slurm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# @package hydra.launcher
# Submitit SLURM launcher configuration for Hydra multirun.
_target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher

# SLURM queue parameters
# partition: p.hpcl8
partition: p.hpcl91

# Job resource requirements
# NOTE: was `14_400` — the underscore digit separator is YAML 1.1 only;
# a YAML 1.2 parser would read it as the string "14_400".
timeout_min: 14400
cpus_per_task: 12
gpus_per_node: 1
gpus_per_task: 1
tasks_per_node: 1
mem_per_cpu: 2g
nodes: 1
exclude: hpcl9101

# Job naming and output (%j expands to the SLURM job id)
name: "pst"
submitit_folder: ./logs/submitit/%j

# Additional settings
signal_delay_s: 5
max_num_timeout: 0
additional_parameters: {}
array_parallelism: 30
55 changes: 55 additions & 0 deletions config/pst_edge_perturb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Hydra config for the edge-randomization ablation pretraining run.
# `_self_` first → the training/base_model/mode groups override keys below.
defaults:
  - _self_
  - training: default
  - base_model: esm2_t6
  - mode: default

debug: false
seed: 1234

data:
  organism: swissprot
  datapath: datasets/AlphaFold/${data.organism}
  graph_eps: 8.0     # distance cutoff (Å, presumably) for graph edges — TODO confirm units
  crop_len: 1024
  mask_rate: 0.15
  val_datapath: datasets/dms
  # Ablation switch added by this change: edges are randomized when set.
  edge_perturb: random

# PyTorch Lightning trainer settings.
compute:
  accelerator: gpu
  precision: 16-mixed
  strategy: ddp
  num_workers: 8
  n_jobs: 10
  devices: auto

logs:
  prefix: logs_pst/random_ablation/edge_perturb_${data.edge_perturb}
  path: ${logs.prefix}/${model.name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
  wandb:
    enable: true
    name: ${model.name}_edge_perturb_${data.edge_perturb}
    tags:
      - random_ablation
      - organism_${data.organism}
      - model_${model.name}
      - edge_perturb_${data.edge_perturb}
    entity: "BorgwardtLab"
    project: "PST"
    save_dir: ${logs.path}

model:
  k_hop: 2
  train_struct_only: true
  use_edge_attr: false
  gnn_type: gin
  edge_dim: null

# output directory, generated dynamically on each run
hydra:
  run:
    dir: ${logs.path}
  sweep:
    dir: ${logs.prefix}/${model.name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
    subdir: ${hydra.job.num}
3 changes: 3 additions & 0 deletions config/pst_gearnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ task:
num_mlp_layer: 0
metric: ['auprc@micro', 'f1_max']

data:
edge_perturb: null

seed: 1234
model: pst_t6
use_edge_attr: false
Expand Down
51 changes: 51 additions & 0 deletions config/pst_gearnet_esmfold.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Hydra config for GearNet-style evaluation on ESMFold-predicted structures.
# `_self_` first → keys from the `dataset` group override values below.
defaults:
  - _self_
  - dataset: gearnet_ec

# torchdrug-style task specification (class names resolved by the consumer).
task:
  class: MultipleBinaryClassification
  model:
    output_dim: 0  # presumably overridden at runtime from the dataset — TODO confirm
  graph_construction_model:
    class: GraphConstruction
    node_layers:
      - class: AlphaCarbonNode
    edge_layers:
      - class: SequentialEdge
  criterion: bce
  num_mlp_layer: 0
  metric: ['auprc@micro', 'f1_max']

data:
  # null disables the edge-randomization ablation added by this change.
  edge_perturb: null
  esmfold_structures_path: datasets/esmfold/structures/

seed: 1234
model: pst_t6
use_edge_attr: false
datapath: datasets
nogpu: false
metric: 'f1_max'  # selection metric (distinct from task.metric above)
batch_size: 4
num_workers: 4
device: null      # null → auto-select

truncation_seq_length: 5000
toks_per_batch: 4096
include_seq: false
aggr: concat
use_pca: null

pretrained: .cache/pst

logs:
  prefix: logs_pst/gearnet_data
  path: ${logs.prefix}/${dataset.name}/${model}/${aggr}/${include_seq}/${seed}

# output directory, generated dynamically on each run
hydra:
  run:
    dir: ${logs.path}
  sweep:
    dir: ${logs.prefix}/${dataset.name}/${model}/${aggr}/${include_seq}/${seed}
    subdir: ""
1 change: 1 addition & 0 deletions config/pst_gearnet_finetune.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ compute:
strategy: auto
num_workers: 8
n_jobs: 10
devices: auto

pretrained: .cache/pst

Expand Down
1 change: 1 addition & 0 deletions config/pst_pretrain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ compute:
strategy: ddp
num_workers: 8
n_jobs: 10
devices: auto

logs:
prefix: logs_pst/pretrain
Expand Down
6 changes: 3 additions & 3 deletions config/pst_proteinshake.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ split: structure
batch_size: 4
num_workers: 4
device: null

truncation_seq_length: 5000
toks_per_batch: 4096
include_seq: false

pretrained: .cache/pst
perturbation: null
model_path: null

logs:
prefix: logs_pst/proteinshake
path: ${logs.prefix}/${task.name}/${model}/${split}/${seed}
path: ${logs.prefix}/${task.name}/${model}/${split}/${seed}/${perturbation}

# output directory, generated dynamically on each run
hydra:
Expand Down
Loading