ESMfold code

ESMFold
For more details, see: GitHub, Preprint

Tips and Instructions
Click the little ▶ play icon to the left of each cell below.
Use "/" to specify chain breaks (e.g. sequence="AAA/AAA").
For homo-oligomeric predictions, set copies > 1.
See experimental notebook for more advanced options (like sampling).
Colab Limitations
For short monomeric proteins (fewer than 400 residues), consider using the ESMFold API (no GPU needed, super fast!).
On a Tesla T4 (the typical free Colab GPU), the maximum total length is ~900.
%%time
#@title install
#@markdown install ESMFold, OpenFold and download Params (~2min 30s)

import os, time
if not os.path.isfile("esmfold.model"):
  # Start fetching the ESMFold params first. The trailing "&" puts aria2c in
  # the background so the pip installs below run while the download proceeds.
  os.system("apt-get install aria2 -qq")
  os.system("aria2c -q -x 16 https://colabfold.steineggerlab.workers.dev/esm/esmfold.model &")

  # OpenFold is installed from this pinned commit.
  commit = "6908936b68ae89f67755240e2f588c09ec31d4c8"

  # pip targets, installed one after another: helper libraries, dllogger,
  # OpenFold (pinned), then ESMFold (sokrypton/esm).
  for pip_target in (
      "omegaconf pytorch_lightning biopython ml_collections einops py3Dmol",
      "git+https://github.com/NVIDIA/dllogger.git",
      f"git+https://github.com/aqlaboratory/openfold.git@{commit}",
      "git+https://github.com/sokrypton/esm.git",
  ):
    os.system("pip install -q " + pip_target)

  if os.path.isfile("esmfold.model"):
    # The primary download produced a file: wait for the params to finish
    # downloading by polling until "esmfold.model.aria2" is gone.
    while os.path.isfile("esmfold.model.aria2"):
      time.sleep(5)
  else:
    # Nothing arrived from the primary URL: fetch from the backup source
    # (this call is not backgrounded, so it blocks until aria2c exits).
    os.system("aria2c -q -x 16 https://files.ipd.uw.edu/pub/esmfold/esmfold.model")

# Mount Google Drive at /content/drive so the files under MyDrive are readable.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the training CSV into a DataFrame (pandas defaults, so the first row is
# treated as the header).
# NOTE(review): `df` is not referenced again in the visible part of this
# notebook; the prediction cells re-read the same file with the csv module.
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/1. ACE inhibitory activity/AHT_train.csv')


#@title ##1. Peptide Properties by **ESMFold**
%%time
from string import ascii_uppercase, ascii_lowercase
import hashlib, re, os
import numpy as np
from jax.tree_util import tree_map
import matplotlib.pyplot as plt
from scipy.special import softmax
import csv

def parse_output(output):
  """Reduce a model output dict to a few per-residue arrays.

  Reads the keys "aligned_confidence_probs", "plddt", "distogram_logits",
  "positions" and "atom37_atom_exists" from ``output`` (array-valued; the
  caller converts them to NumPy first) and returns a dict with:

    "pae":         bin-index-weighted mean of the aligned-confidence
                   probabilities over the 64 bins, scaled by 31
    "plddt":       ``output["plddt"]`` for the first batch entry, atom index 1
    "sm_contacts": softmax probability mass of the distogram bins whose
                   value is below 8
    "xyz":         atom-index-1 coordinates from the last entry of
                   ``output["positions"]``

  Every array is restricted to residues whose atom index 1 is flagged as
  present in "atom37_atom_exists".
  NOTE(review): atom index 1 is presumably CA in the atom37 layout, and the
  bin values presumably distances in Angstrom -- confirm upstream.
  """
  # Residues to keep: atom index 1 exists (first batch entry).
  keep = output["atom37_atom_exists"][0, :, 1] == 1

  def _crop(square):
    # Select kept residues along rows first, then along columns.
    return square[keep, :][:, keep]

  # Weight each of the 64 bins by its index, average over bins, scale by 31.
  bin_index = np.arange(64)
  weighted = output["aligned_confidence_probs"][0] * bin_index
  pae = weighted.mean(-1) * 31

  # Bin values: 0 followed by 63 evenly spaced points in [2.3125, 21.6875].
  bin_values = np.append(0, np.linspace(2.3125, 21.6875, 63))
  below_cutoff = bin_values < 8
  dist_probs = softmax(output["distogram_logits"], -1)[0]
  sm_contacts = dist_probs[..., below_cutoff].sum(-1)

  return {
      "pae": _crop(pae),
      "plddt": output["plddt"][0, :, 1][keep],
      "sm_contacts": _crop(sm_contacts),
      "xyz": output["positions"][-1, 0, :, 1][keep],
  }

def get_hash(x):
  """Return the hexadecimal SHA-1 digest of the string ``x``.

  ``x`` is encoded with ``str.encode()`` defaults before hashing.
  """
  hasher = hashlib.sha1()
  hasher.update(x.encode())
  return hasher.hexdigest()
# Single-letter labels: "A".."Z" followed by "a".."z" (52 entries).
alphabet_list = [*ascii_uppercase, *ascii_lowercase]

jobname = "test" #@param {type:"string"}
# Remove every run of non-word characters, then keep at most 50 characters.
jobname = re.compile(r'\W+').sub('', jobname)[:50]

# Path of CSV file
csv_file_path = "/content/drive/MyDrive/Colab Notebooks/1. ACE inhibitory activity/AHT_train.csv"

# Read the peptide sequences from the first column of the CSV file.
# Each value is upper-cased and stripped of everything except the letters
# A-Z (this also removes "/" and ":", so every entry is a single chain).
# NOTE(review): the first row is read like any other. If the file has a header
# row (pandas' read_csv default assumes one), its first cell is queued as a
# "sequence" too -- confirm and skip it with next(reader, None) if so.
sequences = []
# newline="" is what the csv module documents for files passed to csv.reader.
with open(csv_file_path, "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if not row:
            # Blank line: csv.reader yields an empty list, so row[0] would
            # raise IndexError.
            continue
        sequence = re.sub("[^A-Z]", "", row[0].upper())
        if not sequence:
            # Nothing left after sanitising (empty cell, digits only, ...):
            # do not queue an empty sequence for prediction.
            continue
        sequences.append(sequence)


# Run ESMFold on every peptide in `sequences`.
# For each one: print a short report, then write the PAE matrix
# (<prefix>.pae.txt) and the predicted structure (<prefix>.pdb) into a local
# folder named "<jobname>_<first 5 hex chars of the sequence's SHA-1>".

# Settings shared by all sequences (hoisted out of the loop).
num_recycles = 3 #@param ["0", "1", "2", "3", "6", "12", "24"] {type:"raw"}
chain_linker = 25

# Imported unconditionally: torch is used below even when `model` already
# exists from an earlier run of this cell.
import torch

if "model" not in dir():
    # esmfold.model is a pickled module, not a plain state dict. torch.load
    # defaults to weights_only=True since PyTorch 2.6 and would refuse to
    # unpickle it, so the flag is passed explicitly.
    # SECURITY: this unpickles arbitrary objects -- only load a trusted file.
    model = torch.load("esmfold.model", weights_only=False)
    model.eval().cuda().requires_grad_(False)

for counter, sequence in enumerate(sequences, start=1):
    ID = jobname+"_"+get_hash(sequence)[:5]
    seqs = sequence.split(":")
    lengths = [len(s) for s in seqs]
    length = sum(lengths)
    print("Serial:", counter)
    print("Length:",length)
    print("Sequence:", sequence)

    # Oligomer type derived from the ":"-separated chains.
    u_seqs = list(set(seqs))
    if len(seqs) == 1: mode = "mono"
    elif len(u_seqs) == 1: mode = "homo"
    else: mode = "hetero"

    # optimized for Tesla T4
    model.set_chunk_size(64 if length > 700 else 128)

    torch.cuda.empty_cache()
    output = model.infer(sequence,
                         num_recycles=num_recycles,
                         chain_linker="X"*chain_linker,
                         residue_index_offset=512)

    # Build the PDB text while `output` still holds tensors, then convert
    # every leaf to a NumPy array for the summaries below.
    pdb_str = model.output_to_pdb(output)[0]
    output = tree_map(lambda x: x.cpu().numpy(), output)
    ptm = output["ptm"][0]
    plddt = output["plddt"][0,...,1].mean()
    O = parse_output(output)
    print(f'ptm: {ptm:.3f}')
    print(f'plddt: {plddt:.3f}')
    print(" ")

    # Create the output folder without spawning a shell.
    os.makedirs(ID, exist_ok=True)
    prefix = f"{ID}/ptm{ptm:.3f}_r{num_recycles}_default"
    np.savetxt(f"{prefix}.pae.txt",O["pae"],"%.3f")
    with open(f"{prefix}.pdb","w") as out:
        out.write(pdb_str)


#@title 2. 3D Structure  {run: "auto"}
import py3Dmol
# Palette of 40 hex colours. show_pdb() pairs these, in order, with the chain
# labels in alphabet_list when colouring by chain.
# NOTE(review): the name suggests this mirrors PyMOL's colour list -- not
# verifiable from this file.
pymol_color_list = ["#33ff33","#00ffff","#ff33cc","#ffff00","#ff9999","#e5e5e5","#7f7fff","#ff7f00",
                    "#7fff7f","#199999","#ff007f","#ffdd5e","#8c3f99","#b2b2b2","#007fff","#c4b200",
                    "#8cb266","#00bfbf","#b27f7f","#fcd1a5","#ff7f7f","#ffbfdd","#7fffff","#ffff7f",
                    "#00ff7f","#337fcc","#d8337f","#bfff3f","#ff7fff","#d8d8ff","#3fffbf","#b78c4c",
                    "#339933","#66b2b2","#ba8c84","#84bf00","#b24c66","#7f7f7f","#3f3fa5","#a5512b"]


def show_pdb(pdb_str, sequence, show_sidechains=False, show_mainchains=False,
             color="pLDDT", chains=None, vmin=50, vmax=90,
             size=(800,480), hbondCutoff=4.0,
             Ls=None,
             animate=False):
  """Build a py3Dmol view for a PDB string and return it (not yet shown).

  Args:
    pdb_str: PDB text passed to the viewer.
    sequence: Accepted for call compatibility; not read by this function.
    show_sidechains: If true, add stick/sphere styles for side-chain atoms.
    show_mainchains: If true, add sticks for the C, O, N and CA atoms.
    color: "pLDDT" (gradient over the 'b' property between vmin and vmax),
      "rainbow" (spectrum) or "chain" (one palette colour per chain).
      Any other value applies no cartoon style.
    chains: Number of chains to colour in "chain" mode; when None it is
      len(Ls), or 1 if Ls is None as well.
    vmin, vmax: Bounds of the "pLDDT" colour gradient.
    size: (width, height) of the viewer.
    hbondCutoff: Forwarded to the viewer as the 'hbondCutoff' model option.
    Ls: Optional list of chain lengths; only its length is used.
    animate: If true, load the models as frames and start the animation.

  Returns:
    The configured py3Dmol view; call .show() on it to display.
  """
  if chains is None:
    chains = len(Ls) if Ls is not None else 1

  view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=size[0], height=size[1])

  # Same arguments either way; only the loading method differs.
  load_model = view.addModelsAsFrames if animate else view.addModel
  load_model(pdb_str, 'pdb', {'hbondCutoff': hbondCutoff})

  if color == "pLDDT":
    gradient = {'prop': 'b', 'gradient': 'roygb', 'min': vmin, 'max': vmax}
    view.setStyle({'cartoon': {'colorscheme': gradient}})
  elif color == "rainbow":
    view.setStyle({'cartoon': {'color': 'spectrum'}})
  elif color == "chain":
    # range(chains) only limits how many (label, colour) pairs are consumed.
    for _, chain_id, chain_color in zip(range(chains), alphabet_list, pymol_color_list):
      view.setStyle({'chain': chain_id}, {'cartoon': {'color': chain_color}})

  # Shared look for every stick/sphere added below.
  white_carbon = {'colorscheme': "WhiteCarbon", 'radius': 0.3}

  if show_sidechains:
    backbone = ['C', 'O', 'N']
    # Residues other than GLY/PRO: sticks for everything except C, O, N.
    view.addStyle({'and': [{'resn': ["GLY", "PRO"], 'invert': True},
                           {'atom': backbone, 'invert': True}]},
                  {'stick': white_carbon})
    # GLY: a sphere on CA.
    view.addStyle({'and': [{'resn': "GLY"}, {'atom': 'CA'}]},
                  {'sphere': white_carbon})
    # PRO: sticks for everything except C and O.
    view.addStyle({'and': [{'resn': "PRO"}, {'atom': ['C', 'O'], 'invert': True}]},
                  {'stick': white_carbon})

  if show_mainchains:
    view.addStyle({'atom': ['C', 'O', 'N', 'CA']}, {'stick': white_carbon})

  view.zoomTo()
  if animate:
    view.animate()

  return view

color = "confidence" #@param ["confidence", "rainbow", "chain"]
if color == "confidence": color = "pLDDT"
show_sidechains = False #@param {type:"boolean"}
show_mainchains = False #@param {type:"boolean"}

# Predict, save and display the structure of every peptide in `sequences`.
# NOTE: this cell repeats the ESMFold inference of the prediction cell above
# for every sequence; it does not reuse that cell's results.

# Settings shared by all sequences (hoisted out of the loop).
num_recycles = 3 #@param ["0", "1", "2", "3", "6", "12", "24"] {type:"raw"}
chain_linker = 25
# Google Drive folder that receives a copy of each PDB file.
pdb_drive_directory = "/content/drive/MyDrive/Colab Notebooks/1. ACE inhibitory activity/PDB output"

# Imported unconditionally: torch is used below even when `model` already
# exists from an earlier cell.
import torch

if "model" not in dir():
    # esmfold.model is a pickled module, not a plain state dict. torch.load
    # defaults to weights_only=True since PyTorch 2.6 and would refuse to
    # unpickle it, so the flag is passed explicitly.
    # SECURITY: this unpickles arbitrary objects -- only load a trusted file.
    model = torch.load("esmfold.model", weights_only=False)
    model.eval().cuda().requires_grad_(False)

for counter, sequence in enumerate(sequences, start=1):
    ID = jobname+"_"+get_hash(sequence)[:5]
    seqs = sequence.split(":")
    lengths = [len(s) for s in seqs]
    length = sum(lengths)
    print("Serial:", counter)
    print("Length:",length)
    print("Sequence:", sequence)

    # Oligomer type derived from the ":"-separated chains.
    u_seqs = list(set(seqs))
    if len(seqs) == 1: mode = "mono"
    elif len(u_seqs) == 1: mode = "homo"
    else: mode = "hetero"

    # optimized for Tesla T4
    model.set_chunk_size(64 if length > 700 else 128)

    torch.cuda.empty_cache()
    output = model.infer(sequence,
                         num_recycles=num_recycles,
                         chain_linker="X"*chain_linker,
                         residue_index_offset=512)

    # Build the PDB text while `output` still holds tensors, then convert
    # every leaf to a NumPy array for the summaries below.
    pdb_str = model.output_to_pdb(output)[0]
    output = tree_map(lambda x: x.cpu().numpy(), output)
    ptm = output["ptm"][0]
    plddt = output["plddt"][0,...,1].mean()
    O = parse_output(output)
    print(f'ptm: {ptm:.3f}')
    print(f'plddt: {plddt:.3f}')
    print("3D Structure: ")

    # Local copies (Colab working directory); folder created without a shell.
    os.makedirs(ID, exist_ok=True)
    prefix = f"{ID}/ptm{ptm:.3f}_r{num_recycles}_default"
    np.savetxt(f"{prefix}.pae.txt",O["pae"],"%.3f")
    with open(f"{prefix}.pdb","w") as out:
        out.write(pdb_str)

    # Save the PDB file to Google Drive. Previously this path was computed but
    # never written, so the link printed below pointed at a missing file.
    pdb_drive_filename = os.path.join(pdb_drive_directory, f"{prefix}.pdb")
    os.makedirs(os.path.dirname(pdb_drive_filename), exist_ok=True)
    with open(pdb_drive_filename, "w") as out:
        out.write(pdb_str)

    # Show the 3D structure visualization
    show_pdb(pdb_str, sequence, color=color,
             show_sidechains=show_sidechains,
             show_mainchains=show_mainchains,
             Ls=lengths).show()

    # Print a link to the copy that was just written to Drive.
    print("Download PDB file:")
    print(f"[Download PDB File](sandbox:{pdb_drive_filename})")
    print(" ")