Skip to content

Commit

Permalink
Merge pull request #29 from Oufan75/master
Browse files Browse the repository at this point in the history
Add rotamer libraries for phosphorated residues and functions to preserve original sidechains
  • Loading branch information
JerryJohnsonLee authored Jul 25, 2023
2 parents 1794566 + 02b107f commit 56da7cf
Show file tree
Hide file tree
Showing 38 changed files with 2,357 additions and 584 deletions.
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
.DS_Store

# C extensions
*.so
Expand Down Expand Up @@ -112,3 +113,12 @@ venv.bak/

# Dunbrack library files
SimpleOpt1-5/

# in development files
src/mcsce/core/check_library.py
src/mcsce/*.pdb
src/mcsce/*_mcsce/
src/mcsce/*.csv
src/mcsce/test_pdbs/
src/mcsce/core/data/glycam_06j.xml
src/mcsce/core/sidechain_templates/cap_templates/
24 changes: 23 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,29 @@ MCSCE - Sidechain packing library
Monte Carlo Side Chain Entropy package for generating side chain packing for
fixed protein backbone.

v0.0.0
Updated supports for phosphorated residues and other listed modifications; other ptms in development.

Phosphoralytion(unprotonated, protonated)

- SER: SEP S1P
- THR: TPO T1P
- TYR: PTR Y1P
- HID: H2D H1D
- HIE: H2E H1E

Methylation

- LYS: M3L

N6-carboxylysine

- LYS: KCX

Hydroxylation

- PRO: HYP

v0.1.0

References
==========
Expand Down
88 changes: 69 additions & 19 deletions src/mcsce/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from datetime import datetime

from numpy import mod
from numpy import mod, any, array
from tqdm.std import tqdm

from mcsce.libs.libstructure import Structure
Expand All @@ -17,7 +17,9 @@
parser.add_argument("-s", "--same_structure", action="store_true", default=False, help="When generating PDB files in a folder, whether each structure in the folder has the same amino acid sequence. When this is set to True, the energy function preparation will be done only once.")
parser.add_argument("-b", "--batch_size", default=16, type=int, help="The batch size used for calculating energies for conformations. Consider decreasing batch size when encountered OOM in building.")
parser.add_argument("-m", "--mode", choices=["ensemble", "simple", "exhaustive"], default="ensemble", help="This option controls whether a structural ensemble or just a single structure is created for every input structure. The default behavior is to create a structural ensemble. Simple/exhaustive modes are for creating single structure. In simple mode, n_conf structures are tried sequentially and the first valid structure is returned, and in exhaustive mode, a total number of n_conf structures will be created and the lowest energy conformation is returned.")
parser.add_argument("-f", "--fix", default=None, help="list of residue ids whose side chains are retained. Specify start to stop (inclusive) with dash, and seperate id ranges with plus. E.g. 2-5+10-13. Only supports same structure mode and input structure should contain original side chains for residues to be fixed. Currently assumes continuous residue ids.")
parser.add_argument("-l", "--logfile", default="log.csv", help="File name of the log file")
parser.add_argument("-v", "--verbose", default=False, help="whether to print out warnings")


def load_args(parser):
Expand All @@ -35,10 +37,10 @@ def maincli():
"""Independent client entry point."""
cli(parser, main)

def read_structure_and_check(file_name):
def read_structure_and_check(file_name, retain_idx=[], verbose=False):
"""
Helper function for reading a structure from a given filename and returns the structure object
Also checks whether there are missing atoms in the structure
checks whether there are missing atoms in the structure and
"""
s = Structure(file_name)
s.build()
Expand All @@ -50,21 +52,27 @@ def read_structure_and_check(file_name):
res_labels = s.res_labels
res_labels[res_labels == "HIS"] = "HIP"
s.res_labels = res_labels
print(message)
if verbose:
print(message)
else:
print("HIS residues modified to HIP\n")
missing_backbone_atoms = s.check_backbone_atom_completeness()
if len(missing_backbone_atoms) > 0:
message = f"WARNING! These atoms are missing from the current backbone structure [{file_name}]:"
for resid, atom_name in missing_backbone_atoms:
message += f"\n{resid} {atom_name}"
print(message + "\n")
s = s.remove_side_chains()
if verbose:
print(message + "\n")
else:
print("WARNING! %d missing atoms in total"%(len(message.split("\n")) - 1))
s = s.remove_side_chains(retain_idx)
return s





def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_size=4, same_structure=False):
def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, fix, batch_size=4, same_structure=False, verbose=False):

# antipattern to save time
from mcsce.core.side_chain_builder import initialize_func_calc, create_side_chain_ensemble, create_side_chain
Expand All @@ -78,15 +86,26 @@ def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_siz
f.write("PDB name,Succeeded,Time used\n")

ff = forcefields["Amberff14SB"]
ff_obj = ff(add_OXT=True, add_Nterminal_H=True)

ff_obj = ff(Cterminal='OXT', Nterminal='HN')

if isinstance(fix, str):
if fix.find('-') + fix.find('+') <= -2 :
print('--fix argument syntax error. Abort')
return
elif not same_structure:
print('--fix argument ignored')
if not isinstance(fix, str) and fix is not None:
print('--fix argument syntax error. Abort')
return

if n_worker is None:
import multiprocessing
n_worker = multiprocessing.cpu_count() - 1
else:
n_worker = n_worker
print("# workers:", n_worker)
print("mode: ", mode)
if verbose:
print("# workers:", n_worker)
print("mode: ", mode)

if input_structure[-3:].upper() == "PDB":
all_pdbs = [input_structure]
Expand All @@ -95,20 +114,49 @@ def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_siz
if not input_folder.endswith("/"):
input_folder += "/"
all_pdbs = [input_folder + f for f in os.listdir(input_folder) if f[-3:].upper() == "PDB"]


fix_idxs = []
if same_structure:
# Assume all structures in a folder are the same: the energy creation step can be done only once
s = read_structure_and_check(all_pdbs[0])
s = Structure(all_pdbs[0])
s.build()
#print(s.data_array.shape, s._data_array.shape)
if fix is not None:
fix = fix.strip()
if fix.find('-') == -1:
fix_idxs = [int(n) for n in fix.split('+')]
else:
if fix.find('+') == -1:
fix_id_chunks = [fix]
else:
fix_id_chunks = fix.split('+')
for chunk in fix_id_chunks:
if chunk.find('-') == -1:
fix_idxs += [int(chunk)]
else:
fix_range = [int(n) for n in chunk.split('-')]
if len(fix_range) > 2:
print('--fix argument syntax error. Abort')
return
start, stop = fix_range
fix_idxs += list(range(start, stop+1))
if any(array(fix_idxs) > s.res_nums[-1]):
print('--fix residue id out of range')
return
# remove added sidechains in sections to be processed
s = s.remove_side_chains(fix_idxs)

initialize_func_calc(partial(prepare_energy_function, batch_size=batch_size,
forcefield=ff_obj, terms=["lj", "clash"]),
structure=s)
forcefield=ff_obj, terms=["lj", "clash", "coulomb"]),
structure=s, retain_idxs=fix_idxs)
if mode == "simple" and same_structure and n_worker > 1:
# parallel executing sequential trials on the same structure (different conformations)
t0 = datetime.now()
structures = [read_structure_and_check(f) for f in all_pdbs]
structures = [read_structure_and_check(f, fix_idxs, verbose) for f in all_pdbs]
side_chain_parallel_creator = partial(create_side_chain,
n_trials=n_conf,
temperature=300,
retain_resi=fix_idxs,
parallel_worker=1,
return_first_valid=True)
import multiprocessing
Expand Down Expand Up @@ -142,26 +190,28 @@ def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_siz
output_dir = base_dir + "/" + os.path.splitext(os.path.basename(f))[0] + "_mcsce"
if not os.path.exists(output_dir) and mode == "ensemble":
os.makedirs(output_dir)
s = read_structure_and_check(f)
s = read_structure_and_check(f, fix_idxs, verbose=verbose)


if not same_structure:
initialize_func_calc(partial(prepare_energy_function, batch_size=batch_size,
forcefield=ff_obj, terms=["lj", "clash"]), structure=s)
forcefield=ff_obj, terms=["lj", "clash", "coulomb"]), structure=s)

if mode == "ensemble":
conformations, success_indicator = create_side_chain_ensemble(
s,
n_conf,
temperature=300,
retain_resi=fix_idxs,
save_path=output_dir,
parallel_worker=n_worker,
)

with open(logfile, "a") as _log:
_log.write("%s,%d,%s\n" % (f, sum(success_indicator), str(datetime.now() - t0)))
else:
best_structure = create_side_chain(s, n_conf, 300, n_worker, mode=="simple")
best_structure = create_side_chain(s, n_conf, 300, parallel_worker=n_worker,
retain_resi=fix_idxs, return_first_valid=(mode=="simple"))
if best_structure is not None:
best_structure.write_PDB(output_dir + ".pdb")
with open(logfile, "a") as _log:
Expand Down
Loading

0 comments on commit 56da7cf

Please sign in to comment.