Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement topocg module and helper functions #1007

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
cecec48
copy files from topoaa
rversin Aug 1, 2024
3da47ab
adapted the code to be used as a secondary module
rversin Aug 5, 2024
77a7a93
AA to CG mapping retrieved
rversin Aug 5, 2024
87e0ee9
helper of aa2cg.py
rversin Aug 5, 2024
1b391f7
path to helper corrected
rversin Aug 5, 2024
606ad87
pdb output path corrected
rversin Aug 7, 2024
4048363
topology file created
rversin Aug 9, 2024
193c1a1
correct file names
rversin Aug 16, 2024
eb88399
files renamed
rversin Aug 19, 2024
6006b95
force-field parameter added to the module
rversin Aug 20, 2024
42997cf
name change of the variable ffversion to cgffversion
rversin Aug 28, 2024
50c4e6a
adaptation of the cns file for the generation of topology
rversin Aug 28, 2024
85054c9
cgffversion parameter is changed from string to integer
rversin Sep 4, 2024
5e6b852
topology files named as a function of the MARTINI version
rversin Sep 4, 2024
0b07eda
nucleic acid topology file as function of the MARTINI version
rversin Sep 5, 2024
605a887
cgffversion modified from integer to string
rversin Sep 5, 2024
53b5380
pipeline removed: addition of a covalent bond between acetylated Nter…
rversin Sep 11, 2024
b08ab61
pipeline removed: addition of a covalent bond between a hemeC and a c…
rversin Sep 11, 2024
78c1a50
martini3 option removed
rversin Sep 11, 2024
8af1d3e
pipeline removed: build missing atoms (build-missing.cns)
rversin Sep 12, 2024
7a6b29d
files corrected
rversin Sep 20, 2024
7084271
test files added
rversin Sep 23, 2024
eb354ff
parameter files recreated
rversin Sep 23, 2024
98f1a3f
secondary structures analyzed by aa2cg.py
rversin Sep 25, 2024
1c817c3
parameter files corrected
rversin Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 331 additions & 4 deletions src/haddock/modules/topology/topocg/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,339 @@
"""Create and manage CNS coarse-grained-atom topology.
"""Create and manage CNS all-atom topology.

This module is under development...
The ``[topocg]`` module is dedicated to the generation of CNS compatible
parameters (.param) and topologies (.psf) for each of the input structures.

It will:
- Detect missing atoms, including hydrogens
- Re-build them when missing
- Build and write out topologies (psf) and coordinates (pdb) files

This module is a pre-requisite to run any downstream modules using CNS.
Having access to parameters and topology is mandatory for any kind
of EM/MD related tasks.
Therefore this is the reason why the module [topocg] is often used as first
module in a workflow.

Note that for non-standard bio-molecules
(apart from standard amino-acids, some modified ones, DNA, RNA, ions
and carbohydrates ... see `detailed list of supported molecules
<https://wenmr.science.uu.nl/haddock2.4/library>`_),
such as small-molecules, parameters and topology must be obtained and provided
by the user, as there is currently no built-in solution to generate
them on the fly.
"""
# to be developed
# adding dummy interfaces to pass the tests

import operator
import os
import re
from functools import partial
from pathlib import Path

from haddock.core.typing import FilePath, Optional, ParamDict, ParamMap, Union
from haddock.libs import libpdb
from haddock.libs.libcns import (
generate_default_header,
load_workflow_params,
prepare_output,
prepare_single_input,
)
from haddock.libs.libontology import Format, PDBFile, TopologyFile
from haddock.libs.libstructure import make_molecules
from haddock.libs.libontology import PDBFile
from haddock.libs.libstructure import Molecule
from haddock.libs.libsubprocess import CNSJob
from haddock.modules import get_engine
from haddock.modules.base_cns_module import BaseCNSModule

import haddock.modules.topology.topocg.aa2cg


RECIPE_PATH = Path(__file__).resolve().parent
DEFAULT_CONFIG = Path(RECIPE_PATH, "defaults.yaml")


def generate_topology(
input_pdb: Path,
output_path: str,
recipe_str: str,
defaults: ParamMap,
mol_params: ParamMap,
default_params_path: Optional[FilePath] = None,
write_to_disk: Optional[bool] = True,
) -> Union[Path, str]:
"""Generate a HADDOCK topology file from input_pdb."""
# generate params headers
general_param = load_workflow_params(**defaults)
input_mols_params = load_workflow_params(param_header="", **mol_params)
general_param = general_param + input_mols_params

# generate default headers
link, trans_vec, tensor, scatter, axis, water_box = generate_default_header(
path=default_params_path
)

# AA to CG
rversin marked this conversation as resolved.
Show resolved Hide resolved
cg_pdb_name = aa2cg.martinize(input_pdb, output_path, True)

output = prepare_output(
output_pdb_filename=f"{cg_pdb_name[:-4]}_processed{input_pdb.suffix}",
output_psf_filename=f"{cg_pdb_name[:-4]}_processed.{Format.TOPOLOGY}",
)

input_str = prepare_single_input(str(cg_pdb_name))

inp_parts = (
general_param,
input_str,
output,
link,
trans_vec,
tensor,
scatter,
axis,
water_box,
recipe_str,
)

inp = "".join(inp_parts)
# change the parameter files as function of the force-field version

if write_to_disk:
output_inp_filename = Path(f"{input_pdb.stem}.{Format.CNS_INPUT}")
output_inp_filename.write_text(inp)
return output_inp_filename
else:
return inp


class HaddockModule(BaseCNSModule):
"""HADDOCK3 module to create CNS all-atom topologies."""

name = RECIPE_PATH.name

def __init__(
self, order: int, path: Path, initial_params: FilePath = DEFAULT_CONFIG
) -> None:
cns_script = RECIPE_PATH / "cns" / "generate-topology.cns"
super().__init__(order, path, initial_params, cns_script=cns_script)

@classmethod
def confirm_installation(cls) -> None:
"""Confirm if module is installed."""
return

@staticmethod
def get_md5(ensemble_f: FilePath) -> dict[int, str]:
"""Get MD5 hash of a multi-model PDB file."""
md5_dic: dict[int, str] = {}
text = Path(ensemble_f).read_text()
lines = text.split(os.linesep)
REMARK_lines = (line for line in lines if line.startswith("REMARK"))
remd5 = re.compile(r"^[a-f0-9]{32}$")
for line in REMARK_lines:
parts = line.strip().split()

try:
idx = parts.index("MODEL")
except ValueError: # MODEL not in parts, this line can be ignored
continue

# check if there's a md5 hash in line
for part in parts:
group = remd5.fullmatch(part)
if group:
# the model num comes after the MODEL
model_num = int(parts[idx + 1])
md5_dic[model_num] = group.string # md5 hash
break

return md5_dic

@staticmethod
def get_ensemble_origin(ensemble_f: FilePath) -> dict[int, str]:
"""Try to find origin for each model in ensemble.

Parameters
----------
ensemble_f : FilePath
Path to a pdb file containing an ensemble.

Returns
-------
origin_dic : dict[int, str]
Dictionary holding as keys the modelID and values its origin.
"""
origin_dic: dict[int, str] = {}
text = Path(ensemble_f).read_text()
lines = text.split(os.linesep)
REMARK_lines = (line for line in lines if line.startswith("REMARK"))
re_origin = re.compile(
"REMARK\s+MODEL\s+(\d+)\s+(FROM|from|From)\s+(([\w_-]+\.?)+)"
) # noqa : E501
for line in REMARK_lines:
if match := re_origin.search(line):
model_num = int(match.group(1).strip())
original_path = match.group(3).strip()
original_name = Path(original_path).stem
origin_dic[model_num] = original_name
return origin_dic

def _run(self) -> None:
"""Execute module."""

try:
_molecules = self.previous_io.retrieve_models(individualize=True)
molecules_paths: list[Path] = [mol.rel_path for mol in _molecules] # type: ignore
molecules = make_molecules(molecules_paths, no_parent=True)
except Exception as e:
self.finish_with_error(e)

# extracts `input` key from params. The `input` keyword needs to
# be treated separately
mol_params: ParamDict = {}
for k in list(self.params.keys()):
if k.startswith("mol") and k[3:].isdigit():
mol_params[k] = self.params.pop(k)

# to facilitate the for loop down the line, we create a list with the
# keys of `mol_params` with inverted order (we will use .pop)
mol_params_keys = list(mol_params.keys())[::-1]

# limit is only useful when order == 0
if self.order == 0 and self.params["limit"]:
mol_params_get = mol_params_keys.pop

# `else` is used in any case where limit is False.
else:
mol_params_get = partial(operator.getitem, mol_params_keys, -1)

# Pool of jobs to be executed by the CNS engine
jobs: list[CNSJob] = []

models_dic: dict[int, list[Path]] = {}
ens_dic: dict[int, dict[int, str]] = {}
origi_ens_dic: dict[int, dict[int, str]] = {}
for i, molecule in enumerate(molecules, start=1):
self.log(f"Molecule {i}: {molecule.with_parent}")
models_dic[i] = []
# Copy the molecule to the step folder

# Split models
self.log(
f"Split models if needed for {molecule.with_parent}",
level="debug",
)
# these come already sorted
splited_models = libpdb.split_ensemble(
Path(molecule.with_parent),
dest=Path.cwd(),
)

# get the MD5 hash of each model
ens_dic[i] = self.get_md5(molecule.with_parent)
origi_ens_dic[i] = self.get_ensemble_origin(molecule.with_parent)
# nice variable name, isn't it? :-)
# molecule parameters are shared among models of the same molecule
parameters_for_this_molecule = mol_params[mol_params_get()]

for task_id, model in enumerate(splited_models):
self.log(f"Sanitizing molecule {model.name}")
models_dic[i].append(model)

if self.params["ligand_top_fname"]:
custom_top = self.params["ligand_top_fname"]
self.log(f"Using custom topology {custom_top}")
libpdb.sanitize(
model,
overwrite=True,
custom_topology=custom_top,
)

else:
libpdb.sanitize(model, overwrite=True)

# Prepare generation of topologies jobs
topocg_input = generate_topology(
model,
self.path,
self.recipe_str,
self.params,
parameters_for_this_molecule,
default_params_path=self.toppar_path,
write_to_disk=not self.params["less_io"],
)

self.log("Topology CNS input created")

# Add new job to the pool
output_filename = Path(f"{model.stem}.{Format.CNS_OUTPUT}")

job = CNSJob(
topocg_input,
output_filename,
envvars=self.envvars,
cns_exec=self.params["cns_exec"],
)

jobs.append(job)

# Run CNS Jobs
self.log(f"Running CNS Jobs n={len(jobs)}")
Engine = get_engine(self.params["mode"], self.params)
engine = Engine(jobs)
engine.run()
self.log("CNS jobs have finished")

# Check for generated output, fail it not all expected files
# are found
expected: dict[int, dict[int, PDBFile]] = {}
for i in models_dic:
expected[i] = {}
md5_dic = ens_dic[i]
origin_names = origi_ens_dic[i]
for j, model in enumerate(models_dic[i]):
md5_hash = None
try:
model_id = int(model.stem.split("_")[-1])
except ValueError:
model_id = 0

if model_id in md5_dic:
md5_hash = md5_dic[model_id]

model_name = model.stem
processed_pdb = Path(f"{model_name}_cg_processed.{Format.PDB}")
processed_topology = Path(f"{model_name}_cg_processed.{Format.TOPOLOGY}")

# Check if origin or md5 is available
if md5_hash or model_id in origin_names.keys():
# Select prefix
if md5_hash:
prefix_name = md5_hash
else:
prefix_name = origin_names[model_id]
# Check if topology and file created
if processed_pdb.exists() and processed_topology.exists():
# Build new filename
model_name = f"{prefix_name}_from_{model_name}"
# Rename files
processed_pdb = processed_pdb.rename(
f"{model_name}_haddock_cg_processed.{Format.PDB}"
)
processed_topology = processed_topology.rename(
f"{model_name}_haddock_cg_processed.{Format.TOPOLOGY}"
)

topology = TopologyFile(processed_topology, path=".")
pdb = PDBFile(
file_name=processed_pdb,
topology=topology,
path=".",
md5=md5_hash,
)
pdb.ori_name = model.stem
expected[i][j] = pdb

# Save module information
self.output_models = list(expected.values()) # type: ignore
self.export_io_models(faulty_tolerance=self.params["tolerance"])
Loading