From 3032ac6dc3bb9e437e8b07512ada4c67d19481ad Mon Sep 17 00:00:00 2001 From: Jerry Date: Tue, 14 Mar 2023 11:45:15 -0700 Subject: [PATCH 1/4] Add hints about auto fixing HIS residues --- src/mcsce/cli.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/mcsce/cli.py b/src/mcsce/cli.py index de2f9bb..22d3c25 100644 --- a/src/mcsce/cli.py +++ b/src/mcsce/cli.py @@ -42,12 +42,22 @@ def read_structure_and_check(file_name): """ s = Structure(file_name) s.build() + his_id = [str(resid) for resid, res_type in zip(s.residues, s.residue_types) if res_type == "HIS"] + if len(his_id) > 0: + message = f"WARNING! Undefined histidine protonation status found for structure [{file_name}]:\n" + message += " ".join(his_id) + message += "\nThese residues have been modified to HIP\n" + res_labels = s.res_labels + res_labels[res_labels == "HIS"] = "HIP" + s.res_labels = res_labels + print(message) missing_backbone_atoms = s.check_backbone_atom_completeness() if len(missing_backbone_atoms) > 0: message = f"WARNING! These atoms are missing from the current backbone structure [{file_name}]:" for resid, atom_name in missing_backbone_atoms: message += f"\n{resid} {atom_name}" print(message + "\n") + return s From 21971013b40a6b3794ad926c6ecbfb474dcf028e Mon Sep 17 00:00:00 2001 From: Jerry Date: Tue, 14 Mar 2023 11:46:43 -0700 Subject: [PATCH 2/4] functional updates to support proteins with multiple chains --- src/mcsce/core/side_chain_builder.py | 7 +++- src/mcsce/libs/libenergy.py | 22 +++++++----- src/mcsce/libs/libparse.py | 8 ++--- src/mcsce/libs/libstructure.py | 51 ++++++++++++++++++++++------ 4 files changed, 65 insertions(+), 23 deletions(-) diff --git a/src/mcsce/core/side_chain_builder.py b/src/mcsce/core/side_chain_builder.py index 34fbfb6..f59a883 100644 --- a/src/mcsce/core/side_chain_builder.py +++ b/src/mcsce/core/side_chain_builder.py @@ -72,9 +72,12 @@ def initialize_func_calc(efunc_creator, aa_seq=None, structure=None): aa_seq = structure.residue_types structure = deepcopy(structure) sidechain_placeholders.append(deepcopy(structure)) + n_terms, c_terms = structure.get_terminal_res_atom_arr() energy_calculators.append(efunc_creator(structure.atom_labels, structure.res_nums, - structure.res_labels)) + structure.res_labels, + n_terms, + c_terms)) for idx, resname in tqdm(enumerate(aa_seq), total=len(aa_seq)): template = sidechain_templates[resname] structure.add_side_chain(idx + 1, template) @@ -85,6 +88,8 @@ def initialize_func_calc(efunc_creator, aa_seq=None, structure=None): energy_func = efunc_creator(structure.atom_labels, structure.res_nums, structure.res_labels, + n_terms, + c_terms, partial_indices=[all_indices[-n_sidechain_atoms:], all_indices[:-n_sidechain_atoms]]) energy_calculators.append(energy_func) diff --git a/src/mcsce/libs/libenergy.py b/src/mcsce/libs/libenergy.py index 7a8ee38..1e8283f 100644 --- a/src/mcsce/libs/libenergy.py +++ b/src/mcsce/libs/libenergy.py @@ -28,6 +28,8 @@ def prepare_energy_function( atom_labels, residue_numbers, residue_labels, + N_terminal_indicators, + C_terminal_indicators, forcefield, batch_size=16, partial_indices=None, @@ -163,6 +165,8 @@ def prepare_energy_function( new_indices, old_indices, forcefield.forcefield, + N_terminal_indicators, + C_terminal_indicators, ) # 0.2 as 0.4 @@ -407,6 +411,8 @@ def create_LJ_params_raw( new_indices, old_indices, force_field, + n_terminal_indicators, + c_terminal_indicators, ): """Create ACOEFF and BCOEFF parameters. Borrowed from IDP Conformer Generator package (https://github.com/julie-forman-kay-lab/IDPConformerGenerator) developed by Joao M. C. Teixeira""" @@ -414,8 +420,8 @@ def create_LJ_params_raw( atom_labels[new_indices], residue_numbers[new_indices], residue_labels[new_indices], - min(residue_numbers), - max(residue_numbers), + n_terminal_indicators, + c_terminal_indicators, force_field, 'sigma', ) @@ -424,8 +430,8 @@ def create_LJ_params_raw( atom_labels[old_indices], residue_numbers[old_indices], residue_labels[old_indices], - min(residue_numbers), - max(residue_numbers), + n_terminal_indicators, + c_terminal_indicators, force_field, 'sigma', ) @@ -434,8 +440,8 @@ def create_LJ_params_raw( atom_labels[new_indices], residue_numbers[new_indices], residue_labels[new_indices], - min(residue_numbers), - max(residue_numbers), + n_terminal_indicators, + c_terminal_indicators, force_field, 'epsilon', ) @@ -444,8 +450,8 @@ def create_LJ_params_raw( atom_labels[old_indices], residue_numbers[old_indices], residue_labels[old_indices], - min(residue_numbers), - max(residue_numbers), + n_terminal_indicators, + c_terminal_indicators, force_field, 'epsilon', ) diff --git a/src/mcsce/libs/libparse.py b/src/mcsce/libs/libparse.py index 32cced2..986f601 100644 --- a/src/mcsce/libs/libparse.py +++ b/src/mcsce/libs/libparse.py @@ -415,16 +415,16 @@ def extract_ff_params_for_seq( params_l = [] params_append = params_l.append - zipit = zip(atom_labels, residue_numbers, residue_labels) - for atom_name, res_num, res_label in zipit: + zipit = zip(atom_labels, residue_numbers, residue_labels, n_terminal_idx, c_terminal_idx) + for atom_name, res_num, res_label, is_nterm, is_cterm in zipit: # adds N and C to the terminal residues - if res_num == n_terminal_idx: + if is_nterm: res = 'N' + res_label assert res.isupper() and len(res) == 4, res - elif res_num == c_terminal_idx: + elif is_cterm: res = 'C' + res_label assert res.isupper() and len(res) == 4, res else: diff --git a/src/mcsce/libs/libstructure.py b/src/mcsce/libs/libstructure.py index e2d9a21..4533426 100644 --- a/src/mcsce/libs/libstructure.py +++ b/src/mcsce/libs/libstructure.py @@ -158,6 +158,29 @@ def coords(self): def coords(self, coords): self.data_array[:, cols_coords] = \ np.round(coords, decimals=3).astype(' Date: Tue, 14 Mar 2023 15:48:47 -0700 Subject: [PATCH 3/4] partial bug fixes for multiple-sequence model --- src/mcsce/core/side_chain_builder.py | 1 + src/mcsce/libs/libenergy.py | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/mcsce/core/side_chain_builder.py b/src/mcsce/core/side_chain_builder.py index f59a883..587a89f 100644 --- a/src/mcsce/core/side_chain_builder.py +++ b/src/mcsce/core/side_chain_builder.py @@ -85,6 +85,7 @@ def initialize_func_calc(efunc_creator, aa_seq=None, structure=None): if resname not in ["GLY", "ALA"]: n_sidechain_atoms = len(template[1]) all_indices = np.arange(len(structure.atom_labels)) + n_terms, c_terms = structure.get_terminal_res_atom_arr() energy_func = efunc_creator(structure.atom_labels, structure.res_nums, structure.res_labels, diff --git a/src/mcsce/libs/libenergy.py b/src/mcsce/libs/libenergy.py index 1e8283f..3713434 100644 --- a/src/mcsce/libs/libenergy.py +++ b/src/mcsce/libs/libenergy.py @@ -391,7 +391,7 @@ def create_bonds_apart_mask_for_ij_pairs_old( residue_labels_ij_gen, residue_numbers_ij_gen, atom_labels_ij_gen, - bonds_intra, + bonds_atom_labelsintra, bonds_inter, ) @@ -416,12 +416,13 @@ def create_LJ_params_raw( ): """Create ACOEFF and BCOEFF parameters. Borrowed from IDP Conformer Generator package (https://github.com/julie-forman-kay-lab/IDPConformerGenerator) developed by Joao M. C. Teixeira""" + sigmas_ii_new = extract_ff_params_for_seq( atom_labels[new_indices], residue_numbers[new_indices], residue_labels[new_indices], - n_terminal_indicators, - c_terminal_indicators, + n_terminal_indicators[new_indices], + c_terminal_indicators[new_indices], force_field, 'sigma', ) @@ -430,8 +431,8 @@ def create_LJ_params_raw( atom_labels[old_indices], residue_numbers[old_indices], residue_labels[old_indices], - n_terminal_indicators, - c_terminal_indicators, + n_terminal_indicators[old_indices], + c_terminal_indicators[old_indices], force_field, 'sigma', ) @@ -440,8 +441,8 @@ def create_LJ_params_raw( atom_labels[new_indices], residue_numbers[new_indices], residue_labels[new_indices], - n_terminal_indicators, - c_terminal_indicators, + n_terminal_indicators[new_indices], + c_terminal_indicators[new_indices], force_field, 'epsilon', ) @@ -450,8 +451,8 @@ def create_LJ_params_raw( atom_labels[old_indices], residue_numbers[old_indices], residue_labels[old_indices], - n_terminal_indicators, - c_terminal_indicators, + n_terminal_indicators[old_indices], + c_terminal_indicators[old_indices], force_field, 'epsilon', ) @@ -469,6 +470,7 @@ def create_LJ_params_raw( # mixing rules epsilons_ij = epsilons_ij_pre ** 0.5 + # mixing + nm to Angstrom converstion # / 2 and * 10 sigmas_ij = sigmas_ij_pre * 5 From 17945661c8c5810a48c931b7c0a0415053fbed5f Mon Sep 17 00:00:00 2001 From: Jerry Date: Tue, 14 Mar 2023 18:04:31 -0700 Subject: [PATCH 4/4] more fixes to handle multi-chain proteins --- src/mcsce/cli.py | 6 ++---- src/mcsce/core/side_chain_builder.py | 5 +++-- src/mcsce/libs/libstructure.py | 16 +++++++++++++++- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/mcsce/cli.py b/src/mcsce/cli.py index 22d3c25..9e136bc 100644 --- a/src/mcsce/cli.py +++ b/src/mcsce/cli.py @@ -57,7 +57,7 @@ def read_structure_and_check(file_name): for resid, atom_name in missing_backbone_atoms: message += f"\n{resid} {atom_name}" print(message + "\n") - + s = s.remove_side_chains() return s @@ -98,9 +98,7 @@ def main(input_structure, n_conf, n_worker, output_dir, logfile, mode, batch_siz if same_structure: # Assume all structures in a folder are the same: the energy creation step can be done only once - s = Structure(all_pdbs[0]) - s.build() - s = s.remove_side_chains() + s = read_structure_and_check(all_pdbs[0]) initialize_func_calc(partial(prepare_energy_function, batch_size=batch_size, forcefield=ff_obj, terms=["lj", "clash"]), structure=s) diff --git a/src/mcsce/core/side_chain_builder.py b/src/mcsce/core/side_chain_builder.py index 587a89f..02dfd94 100644 --- a/src/mcsce/core/side_chain_builder.py +++ b/src/mcsce/core/side_chain_builder.py @@ -71,6 +71,7 @@ def initialize_func_calc(efunc_creator, aa_seq=None, structure=None): # extract amino acid sequence from structure object aa_seq = structure.residue_types structure = deepcopy(structure) + chain_ids = structure.residue_chains sidechain_placeholders.append(deepcopy(structure)) n_terms, c_terms = structure.get_terminal_res_atom_arr() energy_calculators.append(efunc_creator(structure.atom_labels, @@ -78,9 +79,9 @@ def initialize_func_calc(efunc_creator, aa_seq=None, structure=None): structure.res_labels, n_terms, c_terms)) - for idx, resname in tqdm(enumerate(aa_seq), total=len(aa_seq)): + for idx, resname, chain_id in tqdm(zip(range(len(aa_seq)), aa_seq, chain_ids), total=len(aa_seq)): template = sidechain_templates[resname] - structure.add_side_chain(idx + 1, template) + structure.add_side_chain(idx + 1, template, chain_id) sidechain_placeholders.append(deepcopy(structure)) if resname not in ["GLY", "ALA"]: n_sidechain_atoms = len(template[1]) diff --git a/src/mcsce/libs/libstructure.py b/src/mcsce/libs/libstructure.py index 4533426..7cfaa6c 100644 --- a/src/mcsce/libs/libstructure.py +++ b/src/mcsce/libs/libstructure.py @@ -223,6 +223,19 @@ def residue_types(self): for chain_residues in chains.values(): restypes.extend(list(chain_residues.values())) return restypes + + @property + def residue_chains(self): + c, rs, rn = col_chainID, col_resSeq, col_resName + + chains = defaultdict(dict) + for row in self.filtered_atoms: + chains[row[c]].setdefault(row[rs], row[rn]) + + chain_ids = [] + for chain_id in chains: + chain_ids.extend([chain_id] * len(chains[chain_id])) + return chain_ids @property def filtered_residues(self): @@ -479,7 +492,7 @@ def remove_side_chains(self): copied_structure._data_array = copied_structure.data_array[retained_atoms_filter] return copied_structure - def add_side_chain(self, res_idx, sidechain_template): + def add_side_chain(self, res_idx, sidechain_template, chain_id='A'): template_structure, sc_atoms = sidechain_template self.add_filter_resnum(res_idx) N_CA_C_coords = self.get_sorted_minimal_backbone_coords(filtered=True) @@ -487,6 +500,7 @@ def add_side_chain(self, res_idx, sidechain_template): sidechain_data_arr = template_structure.data_array.copy() sidechain_data_arr[:, cols_coords] = np.round(sc_all_atom_coords, decimals=3).astype('