Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rotamer libraries for phosphorated residues and functions to preserve original sidechains #29

Merged
merged 23 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
.DS_Store

# C extensions
*.so
Expand Down Expand Up @@ -112,3 +113,12 @@ venv.bak/

# Dunbrack library files
SimpleOpt1-5/

# in development files
src/mcsce/core/check_library.py
src/mcsce/*.pdb
src/mcsce/*_mcsce/
src/mcsce/*.csv
src/mcsce/test_pdbs/
src/mcsce/core/data/glycam_06j.xml
src/mcsce/core/sidechain_templates/cap_templates/
24 changes: 23 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,29 @@ MCSCE - Sidechain packing library
Monte Carlo Side Chain Entropy package for generating side chain packing for
fixed protein backbone.

v0.0.0
Updated supports for phosphorated residues and other listed modifications; other ptms in development.

Phosphoralytion(unprotonated, protonated)

- SER: SEP S1P
- THR: TPO T1P
- TYR: PTR Y1P
- HID: H2D H1D
- HIE: H2E H1E

Methylation

- LYS: M3L

N6-carboxylysine

- LYS: KCX

Hydroxylation

- PRO: HYP

v0.1.0

References
==========
Expand Down
88 changes: 69 additions & 19 deletions src/mcsce/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from datetime import datetime

from numpy import mod
from numpy import mod, any, array
from tqdm.std import tqdm

from mcsce.libs.libstructure import Structure
Expand All @@ -17,7 +17,9 @@
parser.add_argument("-s", "--same_structure", action="store_true", default=False, help="When generating PDB files in a folder, whether each structure in the folder has the same amino acid sequence. When this is set to True, the energy function preparation will be done only once.")
parser.add_argument("-b", "--batch_size", default=16, type=int, help="The batch size used for calculating energies for conformations. Consider decreasing batch size when encountered OOM in building.")
parser.add_argument("-m", "--mode", choices=["ensemble", "simple", "exhaustive"], default="ensemble", help="This option controls whether a structural ensemble or just a single structure is created for every input structure. The default behavior is to create a structural ensemble. Simple/exhaustive modes are for creating single structure. In simple mode, n_conf structures are tried sequentially and the first valid structure is returned, and in exhaustive mode, a total number of n_conf structures will be created and the lowest energy conformation is returned.")
parser.add_argument("-f", "--fix", default=None, help="list of residue ids whose side chains are retained. Specify start to stop (inclusive) with dash, and seperate id ranges with plus. E.g. 2-5+10-13. Only supports same structure mode and input structure should contain original side chains for residues to be fixed. Currently assumes continuous residue ids.")
parser.add_argument("-l", "--logfile", default="log.csv", help="File name of the log file")
parser.add_argument("-v", "--verbose", default=False, help="whether to print out warnings")


def load_args(parser):
Expand All @@ -35,10 +37,10 @@ def maincli():
"""Independent client entry point."""
cli(parser, main)

def read_structure_and_check(file_name):
def read_structure_and_check(file_name, retain_idx=[], verbose=False):
"""
Helper function for reading a structure from a given filename and returns the structure object
Also checks whether there are missing atoms in the structure
checks whether there are missing atoms in the structure and
"""
s = Structure(file_name)
s.build()
Expand All @@ -50,21 +52,27 @@ def read_structure_and_check(file_name):
res_labels = s.res_labels
res_labels[res_labels == "HIS"] = "HIP"
s.res_labels = res_labels
print(message)
if verbose:
print(message)
else:
print("HIS residues modified to HIP\n")
missing_backbone_atoms = s.check_backbone_atom_completeness()
if len(missing_backbone_atoms) > 0:
message = f"WARNING! These atoms are missing from the current backbone structure [{file_name}]:"
for resid, atom_name in missing_backbone_atoms:
message += f"\n{resid} {atom_name}"
print(message + "\n")
s = s.remove_side_chains()
if verbose:
print(message + "\n")
else:
print("WARNING! %d missing atoms in total"%(len(message.split("\n")) - 1))
s = s.remove_side_chains(retain_idx)
return s





def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_size=4, same_structure=False):
def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, fix, batch_size=4, same_structure=False, verbose=False):

# antipattern to save time
from mcsce.core.side_chain_builder import initialize_func_calc, create_side_chain_ensemble, create_side_chain
Expand All @@ -78,15 +86,26 @@ def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_siz
f.write("PDB name,Succeeded,Time used\n")

ff = forcefields["Amberff14SB"]
ff_obj = ff(add_OXT=True, add_Nterminal_H=True)

ff_obj = ff(Cterminal='OXT', Nterminal='HN')

if isinstance(fix, str):
if fix.find('-') + fix.find('+') <= -2 :
print('--fix argument syntax error. Abort')
return
elif not same_structure:
print('--fix argument ignored')
if not isinstance(fix, str) and fix is not None:
print('--fix argument syntax error. Abort')
return

if n_worker is None:
import multiprocessing
n_worker = multiprocessing.cpu_count() - 1
else:
n_worker = n_worker
print("# workers:", n_worker)
print("mode: ", mode)
if verbose:
print("# workers:", n_worker)
print("mode: ", mode)

if input_structure[-3:].upper() == "PDB":
all_pdbs = [input_structure]
Expand All @@ -95,20 +114,49 @@ def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_siz
if not input_folder.endswith("/"):
input_folder += "/"
all_pdbs = [input_folder + f for f in os.listdir(input_folder) if f[-3:].upper() == "PDB"]


fix_idxs = []
if same_structure:
# Assume all structures in a folder are the same: the energy creation step can be done only once
s = read_structure_and_check(all_pdbs[0])
s = Structure(all_pdbs[0])
s.build()
#print(s.data_array.shape, s._data_array.shape)
if fix is not None:
fix = fix.strip()
if fix.find('-') == -1:
fix_idxs = [int(n) for n in fix.split('+')]
else:
if fix.find('+') == -1:
fix_id_chunks = [fix]
else:
fix_id_chunks = fix.split('+')
for chunk in fix_id_chunks:
if chunk.find('-') == -1:
fix_idxs += [int(chunk)]
else:
fix_range = [int(n) for n in chunk.split('-')]
if len(fix_range) > 2:
print('--fix argument syntax error. Abort')
return
start, stop = fix_range
fix_idxs += list(range(start, stop+1))
if any(array(fix_idxs) > s.res_nums[-1]):
print('--fix residue id out of range')
return
# remove added sidechains in sections to be processed
s = s.remove_side_chains(fix_idxs)

initialize_func_calc(partial(prepare_energy_function, batch_size=batch_size,
forcefield=ff_obj, terms=["lj", "clash"]),
structure=s)
forcefield=ff_obj, terms=["lj", "clash", "coulomb"]),
structure=s, retain_idxs=fix_idxs)
if mode == "simple" and same_structure and n_worker > 1:
# parallel executing sequential trials on the same structure (different conformations)
t0 = datetime.now()
structures = [read_structure_and_check(f) for f in all_pdbs]
structures = [read_structure_and_check(f, fix_idxs, verbose) for f in all_pdbs]
side_chain_parallel_creator = partial(create_side_chain,
n_trials=n_conf,
temperature=300,
retain_resi=fix_idxs,
parallel_worker=1,
return_first_valid=True)
import multiprocessing
Expand Down Expand Up @@ -142,26 +190,28 @@ def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_siz
output_dir = base_dir + "/" + os.path.splitext(os.path.basename(f))[0] + "_mcsce"
if not os.path.exists(output_dir) and mode == "ensemble":
os.makedirs(output_dir)
s = read_structure_and_check(f)
s = read_structure_and_check(f, fix_idxs, verbose=verbose)


if not same_structure:
initialize_func_calc(partial(prepare_energy_function, batch_size=batch_size,
forcefield=ff_obj, terms=["lj", "clash"]), structure=s)
forcefield=ff_obj, terms=["lj", "clash", "coulomb"]), structure=s)

if mode == "ensemble":
conformations, success_indicator = create_side_chain_ensemble(
s,
n_conf,
temperature=300,
retain_resi=fix_idxs,
save_path=output_dir,
parallel_worker=n_worker,
)

with open(logfile, "a") as _log:
_log.write("%s,%d,%s\n" % (f, sum(success_indicator), str(datetime.now() - t0)))
else:
best_structure = create_side_chain(s, n_conf, 300, n_worker, mode=="simple")
best_structure = create_side_chain(s, n_conf, 300, parallel_worker=n_worker,
retain_resi=fix_idxs, return_first_valid=(mode=="simple"))
if best_structure is not None:
best_structure.write_PDB(output_dir + ".pdb")
with open(logfile, "a") as _log:
Expand Down
Loading
Loading