diff --git a/polymerist/genutils/importutils/dependencies.py b/polymerist/genutils/importutils/dependencies.py index 4787afe..a69b4c8 100644 --- a/polymerist/genutils/importutils/dependencies.py +++ b/polymerist/genutils/importutils/dependencies.py @@ -3,7 +3,7 @@ __author__ = 'Timotej Bernat' __email__ = 'timotej.bernat@colorado.edu' -from typing import Callable, ParamSpec, TypeVar +from typing import Callable, Optional, ParamSpec, TypeVar Params = ParamSpec('Params') ReturnType = TypeVar('ReturnType') @@ -14,6 +14,25 @@ from functools import wraps +class MissingPrerequisitePackage(Exception): + '''Raised when a package dependency cannot be found and the user should be alerted with install instructions''' + def __init__(self, + importing_package_name : str, + use_case : str, + install_link : str, + dependency_name : str, + dependency_name_formal : Optional[str]=None + ): + if dependency_name_formal is None: + dependency_name_formal = dependency_name + + message = f''' + {use_case.capitalize()} require(s) {dependency_name_formal}, which was not found in the current environment + Please install `{dependency_name}` by following the installation instructions at {install_link} + Then try importing from "{importing_package_name}" again''' + + super().__init__(message) + def module_installed(module_name : str) -> bool: ''' Check whether a module of the given name is present on the system diff --git a/polymerist/genutils/textual/strsearch.py b/polymerist/genutils/textual/strsearch.py deleted file mode 100644 index 919e797..0000000 --- a/polymerist/genutils/textual/strsearch.py +++ /dev/null @@ -1,41 +0,0 @@ -'''For searching and replacing through strings and text files''' - -__author__ = 'Timotej Bernat' -__email__ = 'timotej.bernat@colorado.edu' - -from typing import Callable, Optional -from pathlib import Path - -from ..fileutils.extensions import FileTypeError - - -def shortest_repeating_substring(string : str) -> str: - '''Return the shortest substring such that the passed string can be written as some number of repeats (including 1) of the substring - Will return the original string if no simpler decomposition exists''' - i = (2*string).find(string, 1, -1) # check if string matches itself in a cycle in non-trivial way (i.e more than just the two repeats) - return string if (i == -1) else string[:i] - -def filter_text_by_condition(in_text_path : Path, condition : Callable[[str], bool], out_text_path : Optional[Path]=None, postfix : str='filtered', inclusive : bool=True, return_filtered_path : bool=False) -> Optional[Path]: - '''Create a copy of a text-based file containing only the lines which match to a given boolean condition - - If no explicit output path is given, will create an output file in the same directory as the source file - with the same name plus "postfix" tacked on. 
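For orientation, here is a minimal sketch (not part of the diff) of the guarded-import pattern that the new `MissingPrerequisitePackage` is designed for, mirroring how this PR uses it in `openfftools/__init__.py` and `polymers/building/__init__.py`; the dependency name and install link below are placeholders, not real requirements:

```python
# Hypothetical subpackage __init__.py guarding an optional dependency
from polymerist.genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage

if not modules_installed('some_optional_dep'):  # placeholder module name
    raise MissingPrerequisitePackage(
        importing_package_name=__spec__.name,             # resolves to the guarded subpackage's name
        use_case='These optional add-ons',
        install_link='https://example.org/install-docs',  # placeholder URL
        dependency_name='some-optional-dep',              # name to hand to the package manager
        dependency_name_formal='the Some Optional Dep toolkit',
    )
```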
Can optionally return the path to the filtered file (else None) - - "Inclusive" kw governs whether to write lines which DO or DON'T meet the condition''' - if out_text_path is None: - out_text_path = in_text_path.with_stem(f'{in_text_path.stem}{"_" if postfix else ""}{postfix}') - - if (out_text_path == in_text_path): - raise PermissionError(f'Attempting to overwrite {in_text_path} with regex filter') # prevent write clash - - if (out_text_path.suffix != in_text_path.suffix): # prevent file type conversion during transfer - raise FileTypeError(f'Input and output file must have same extension (not {in_text_path.suffix} and {out_text_path.suffix})') - - with out_text_path.open('w') as outfile: - with in_text_path.open('r') as infile: # readfile is innermost in case error occurs during file read (caught by handler one level up) - for line in infile: - if (condition(line) == inclusive): # only write lines if (matching AND inclusive) OR (not matching AND exclusive) - outfile.write(line) - - if return_filtered_path: - return out_text_path \ No newline at end of file diff --git a/polymerist/genutils/textual/substrings.py b/polymerist/genutils/textual/substrings.py new file mode 100644 index 0000000..83c6f4f --- /dev/null +++ b/polymerist/genutils/textual/substrings.py @@ -0,0 +1,81 @@ +'''For identifying and concatenating substrings of other strings with unique properties''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + + +def unique_string(string : str, preserve_order : bool=True) -> str: + ''' + Accepts a string and returns another string containing + only the UNIQUE characters in the origin string + + Can specify whether order is important with the "preserve_order" keyword + + Parameters + ---------- + string : str + An arbitrary string on wants the unique characters from + preserve_order : bool, default True + Whether or not to keep the unique characters in the order they are found + For example: + unique_string("balaclava", preserve_order=False) -> "bcavl" + unique_string("balaclava", preserve_order=True) -> "balcv" + + Returns + ------- + uniquified_str : str + Another string containing only the unique characters in "string" + Order depends on the value of the "preserve_order" parameter + ''' + if not preserve_order: + unique_chars = set(string) + else: + unique_chars = [] + for char in string: + if char not in unique_chars: + unique_chars.append(char) + + return ''.join(unique_chars) + +def shortest_repeating_substring(string : str) -> str: + '''Return the shortest substring such that the passed string can be written as some number of repeats (including 1) of the substring + Will return the original string if no simpler decomposition exists''' + i = (2*string).find(string, 1, -1) # check if string matches itself in a cycle in non-trivial way (i.e more than just the two repeats) + return string if (i == -1) else string[:i] + +def repeat_string_to_length(string : str, target_length : int, joiner : str='') -> str: + ''' + Takes a string and repeats it cyclically to produce another string of a given length + The number of times the original string occurs in the new string may be fractional + for example: + >> repeat_string_to_length("CAT", 6) -> "CATCAT" + >> repeat_string_to_length("BACA", 10) -> "BACABACABA" + + Parameters + ---------- + string : str + An arbitrary string to repeat + target_length : int + The length of the final desired string + This does NOT have to be an integer multiple of the length of "string" + E.g. 
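As a quick, purely illustrative check of the two helpers already defined above (the expected values follow directly from their implementations):

```python
# Expected behavior of the new substring helpers defined above
from polymerist.genutils.textual.substrings import unique_string, shortest_repeating_substring

assert unique_string('balaclava', preserve_order=True) == 'balcv'
assert shortest_repeating_substring('ABCABCABC') == 'ABC'   # decomposes into 3 repeats of "ABC"
assert shortest_repeating_substring('ABCA') == 'ABCA'       # no nontrivial decomposition exists
```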
repeat_string_to_length("BACA", 10) -> "BACABACABA" + Nor does it have to be greater than the length of "string" + E.g. repeat_string_to_length("BACA", 3) -> "BAC" + + Returns + ------- + rep_string : str + A new string which has the desired target length and consists of cycles of the initial string + ''' + if not string: + raise ValueError(f'Cannot generate nonempty string from any amount of repeats of the empty string') + if not isinstance(target_length, int): + raise TypeError(f'Only integer target string lengths are allowed, not non-integer type "{type(target_length).__name__}"') + if target_length < 0: + raise IndexError(f'Cannot generate a string of negative length (requested length of {target_length} character(s))') + + num_str_reps, num_extra_chars = divmod(target_length, len(string)) + remainder = (string[:num_extra_chars],) if num_extra_chars else () # empty container avoids extra joiner at end when remainder string is empty + + return joiner.join(num_str_reps*(string,) + remainder) # tuples here are ~2 OOM faster than moral equivalent with lists + \ No newline at end of file diff --git a/polymerist/mdtools/openfftools/__init__.py b/polymerist/mdtools/openfftools/__init__.py index 8f1ab65..f0c5f59 100644 --- a/polymerist/mdtools/openfftools/__init__.py +++ b/polymerist/mdtools/openfftools/__init__.py @@ -4,13 +4,14 @@ __email__ = 'timotej.bernat@colorado.edu' # Subpackage-wide precheck to see if OpenFF is even usable in the first place -from ...genutils.importutils.dependencies import modules_installed +from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage if not modules_installed('openff', 'openff.toolkit'): - raise ModuleNotFoundError( - f''' - OpenFF packages which are required to utilitize {__name__} not found in current environment - Please follow installation instructions at https://docs.openforcefield.org/projects/toolkit/en/stable/installation.html, then retry import - ''' + raise MissingPrerequisitePackage( + importing_package_name=__spec__.name, + use_case='OpenFF addons', + install_link='https://docs.openforcefield.org/projects/toolkit/en/stable/installation.html', + dependency_name='openff-toolkit', + dependency_name_formal='the OpenFF software stack', ) # Import of toplevel OpenFF object registries diff --git a/polymerist/mdtools/openfftools/boxvectors.py b/polymerist/mdtools/openfftools/boxvectors.py index 49ea028..7495d6a 100644 --- a/polymerist/mdtools/openfftools/boxvectors.py +++ b/polymerist/mdtools/openfftools/boxvectors.py @@ -14,7 +14,7 @@ from openff.toolkit import Topology from openff.interchange.components._packmol import _box_vectors_are_in_reduced_form -from .omminter.unitsys import allow_openmm_units, openff_to_openmm +from .unitsys import allow_openmm_units, openff_to_openmm # CUSTOM TYPES FOR CLARITY, ESPECIALLY WITH UNITS diff --git a/polymerist/mdtools/openfftools/omminter/__init__.py b/polymerist/mdtools/openfftools/omminter/__init__.py index db12755..e1522de 100644 --- a/polymerist/mdtools/openfftools/omminter/__init__.py +++ b/polymerist/mdtools/openfftools/omminter/__init__.py @@ -4,9 +4,3 @@ __email__ = 'timotej.bernat@colorado.edu' from .mdobjects import forcefield_flexible, openff_topology_to_openmm -from .unitsys import ( - openmm_to_openff, - openff_to_openmm, - allow_openmm_units, - allow_openff_units, -) \ No newline at end of file diff --git a/polymerist/mdtools/openfftools/omminter/mdobjects.py b/polymerist/mdtools/openfftools/omminter/mdobjects.py index 2ed2be9..7f0af55 100644 --- 
a/polymerist/mdtools/openfftools/omminter/mdobjects.py +++ b/polymerist/mdtools/openfftools/omminter/mdobjects.py @@ -15,7 +15,7 @@ from openmm.app import Topology as OMMTopology from openmm.unit import Quantity -from .unitsys import openff_to_openmm +from ..unitsys import openff_to_openmm from .. import FFDIR from ..boxvectors import box_vectors_flexible, VectorQuantity, BoxVectorsQuantity @@ -39,8 +39,13 @@ def forcefield_flexible(forcefield : Union[ForceField, str, Path]) -> ForceField return ForceField(ff_path) -def openff_topology_to_openmm(offtop : OFFTopology, forcefield : Union[ForceField, str, Path], box_vecs : Optional[Union[VectorQuantity, BoxVectorsQuantity]]=None, - combine_nonbonded_forces : bool=False, add_constrained_forces : bool=False) -> tuple[OMMTopology, System, Quantity]: +def openff_topology_to_openmm( + offtop : OFFTopology, + forcefield : Union[ForceField, str, Path], + box_vecs : Optional[Union[VectorQuantity, BoxVectorsQuantity]]=None, + combine_nonbonded_forces : bool=False, + add_constrained_forces : bool=False + ) -> tuple[OMMTopology, System, Quantity]: '''Converts an OpenFF Topology to an OpenMM Topology, System, and Positions''' if box_vecs is not None: offtop.box_vectors = box_vectors_flexible(box_vecs) diff --git a/polymerist/mdtools/openfftools/solvation/physprops.py b/polymerist/mdtools/openfftools/solvation/physprops.py index 1de70a2..bd9f108 100644 --- a/polymerist/mdtools/openfftools/solvation/physprops.py +++ b/polymerist/mdtools/openfftools/solvation/physprops.py @@ -15,7 +15,7 @@ from openff.units import Quantity as OFFQuantity from ....unitutils.dimensions import is_volume -from ..omminter.unitsys import allow_openff_units, openff_to_openmm +from ..unitsys import allow_openff_units, openff_to_openmm # MASS diff --git a/polymerist/mdtools/openfftools/solvation/solvents/__init__.py b/polymerist/mdtools/openfftools/solvation/solvents/__init__.py index f540da0..a11bf94 100644 --- a/polymerist/mdtools/openfftools/solvation/solvents/__init__.py +++ b/polymerist/mdtools/openfftools/solvation/solvents/__init__.py @@ -10,7 +10,6 @@ from openff.units import unit as offunit from ... import topology -from ... 
import TKREGS def generate_water_TIP3P() -> Molecule: diff --git a/polymerist/mdtools/openfftools/omminter/unitsys.py b/polymerist/mdtools/openfftools/unitsys.py similarity index 100% rename from polymerist/mdtools/openfftools/omminter/unitsys.py rename to polymerist/mdtools/openfftools/unitsys.py diff --git a/polymerist/mdtools/openmmtools/serialization.py b/polymerist/mdtools/openmmtools/serialization.py index 6bad38a..97348a6 100644 --- a/polymerist/mdtools/openmmtools/serialization.py +++ b/polymerist/mdtools/openmmtools/serialization.py @@ -23,6 +23,7 @@ from ...genutils.fileutils.pathutils import assemble_path from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable from ...genutils.fileutils.jsonio.serialize import PathSerializer +from ...molfiles.pdb import SerialAtomLabeller # DEFINING AND STORING SIMULATION PATHS @@ -119,12 +120,18 @@ def serialize_system(sys_path : Path, system : System) -> None: file.write(XmlSerializer.serialize(system)) @allow_string_paths -def serialize_openmm_pdb(pdb_path : Path, topology : OpenMMTopology, positions : Union[NDArray, list[Vec3]], keep_chain_and_res_ids : bool=True, - uniquify_atom_ids : bool=True, num_atom_id_digits : int=2, resname_repl : Optional[dict[str, str]]=None) -> None: +def serialize_openmm_pdb( + pdb_path : Path, + topology : OpenMMTopology, + positions : Union[NDArray, list[Vec3]], + keep_chain_and_res_ids : bool=True, + atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(), + resname_map : Optional[dict[str, str]]=None, + ) -> None: '''Configure and write an Protein DataBank File from an OpenMM Topology and array of positions Provides options to configure atom ID numbering, residue numbering, and residue naming''' - if resname_repl is None: - resname_repl = {} # avoids mutable default + if resname_map is None: + resname_map = {} # avoids mutable default # chain config for chain in topology.chains(): @@ -133,18 +140,14 @@ def serialize_openmm_pdb(pdb_path : Path, topology : OpenMMTopology, positions : # residue config for residue in topology.residues(): residue.id = str(residue.id) # avoids TypeError when specifying keepIds during PDB write - repl_res_name = resname_repl.get(residue.name, None) # lookup current residue name to see if a replacement is called for + repl_res_name = resname_map.get(residue.name, None) # lookup current residue name to see if a replacement is called for if repl_res_name is not None: residue.name = repl_res_name # individual atom config - element_counter = Counter() # for keeping track of the running index of each distinct element - could be used to produce a Hill formula - for atom in topology.atoms(): - symbol = atom.element.symbol - atom_id = element_counter[symbol] - if uniquify_atom_ids: - atom.name = f'{symbol}{atom_id:0{num_atom_id_digits}d}' # extend atom name with ordered integer with specified number of digits (including leading zeros) - element_counter[symbol] += 1 + if atom_labeller: # implicitly, preserves extant atom names if a labeller is not given + for atom in topology.atoms(): + atom.name = atom_labeller.get_atom_label(atom.element.symbol) # file write with pdb_path.open('w') as file: diff --git a/polymerist/molfiles/__init__.py b/polymerist/molfiles/__init__.py new file mode 100644 index 0000000..314438a --- /dev/null +++ b/polymerist/molfiles/__init__.py @@ -0,0 +1,4 @@ +'''Utilities for reading from and writing to various molecular file formats''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' \ No newline at end of file diff --git 
a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py new file mode 100644 index 0000000..1852423 --- /dev/null +++ b/polymerist/molfiles/pdb.py @@ -0,0 +1,75 @@ +'''PDB file formatting tools''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +from dataclasses import dataclass, field +from collections import Counter + + +@dataclass(frozen=True) +class SerialAtomLabeller: + ''' + For assigning unique numbered atom names based on their + order of appearance within a molecule and elemental class + + Useful, for example, in generating unique atom names for a PDB file + + Parameters + ---------- + atom_label_width : int , default 4 + Exact length alloted for any generated atom label + Labels shorter than this are right-padded with spaces, + while labels longer than this are truncated + + Default of 4 is the chosen to be compatible with the PDB specification ("Atom name: lines 13-16, left-justified") + https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html + include_elem_idx : bool, default True + Whether to attach a numerical element-index postfix to atom labels + + E.g. with atom_label_width=4, the fifth carbon in a topology + will be labelled as "C004" with include_elem_idx=True, + while labelled as "C " with include_elem_idx=False, + default_elem_idx : int, default 0 + Starting index for each element category + By default, is 0-indexed; MUST BE POSITIVE + ''' + atom_label_width : int = 4 + include_elem_idx : bool = True + default_elem_idx : int = 0 + + element_counter : Counter = field(init=False, default_factory=Counter) + + def __post_init__(self) -> None: + '''Check ranges on input values''' + if self.atom_label_width < 0: + raise ValueError(f'Must provide a non-negative number of index digits to include (provided {self.atom_label_width})') + + if self.default_elem_idx < 0: + raise ValueError(f'Must provide a non-negative starting index for element indices (provided {self.default_elem_idx})') + + def get_atom_label(self, elem_symbol : str) -> str: + ''' + Obtain a numbered atom label for an atom based on its element, + updating the underlying element context in the process + ''' + if not isinstance(elem_symbol, str): + raise TypeError(f'Must pass symbol of atom\'s element as str (not type {type(elem_symbol).__name__})') + + if elem_symbol not in self.element_counter: # initialize first occurence to starting value + self.element_counter[elem_symbol] = self.default_elem_idx + + atom_idx_label : str = '' + if self.include_elem_idx: + atom_idx = self.element_counter[elem_symbol] + num_idx_digits = max(self.atom_label_width - len(elem_symbol), 0) # number of symbols left over for an atom index + atom_idx_label = f'{atom_idx:0{num_idx_digits}d}' + + atom_name = f'{elem_symbol}{atom_idx_label}' + atom_name = atom_name.ljust(self.atom_label_width, ' ')[:self.atom_label_width] # pad with spaces if too short, or truncate if too long + assert(len(atom_name) <= self.atom_label_width) # perfunctory check to make sure things are working as expected + + self.element_counter[elem_symbol] += 1 # update tally with addition of new occurence of a particular element + + return atom_name + \ No newline at end of file diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py deleted file mode 100644 index e7898c3..0000000 --- a/polymerist/polymers/building.py +++ /dev/null @@ -1,111 +0,0 @@ -'''Utilities for building new polymer structures; currently limited to linear polymers and PDB save format''' - -__author__ = 'Timotej Bernat' 
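An illustrative sketch (not part of the diff) of how the new `SerialAtomLabeller` behaves with its defaults, and where it plugs into the reworked `serialize_openmm_pdb()` signature shown earlier:

```python
from polymerist.molfiles.pdb import SerialAtomLabeller

labeller = SerialAtomLabeller()  # defaults: atom_label_width=4, include_elem_idx=True, default_elem_idx=0
print([labeller.get_atom_label(sym) for sym in ('C', 'C', 'H', 'Cl', 'C')])
# -> ['C000', 'C001', 'H000', 'Cl00', 'C002']  (per-element counters, padded/truncated to 4 characters)

# The labeller replaces the old uniquify_atom_ids/num_atom_id_digits flags; passing atom_labeller=None
# now preserves any existing atom names:
# serialize_openmm_pdb(pdb_path, topology, positions, atom_labeller=SerialAtomLabeller(), resname_map={'RES': 'Pol'})
```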
-__email__ = 'timotej.bernat@colorado.edu' - -import logging -LOGGER = logging.getLogger(__name__) - -import warnings -with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings - warnings.filterwarnings('ignore', category=DeprecationWarning) - import mbuild as mb - from mbuild import Compound - from mbuild.lib.recipes.polymer import Polymer as MBPolymer - -from pathlib import Path -from rdkit import Chem - -from .exceptions import MorphologyError -from .estimation import estimate_chain_len_linear -from ..polymers.monomers.repr import MonomerGroup -from ..polymers.monomers.specification import SANITIZE_AS_KEKULE - -from ..genutils.decorators.functional import allow_string_paths -from ..rdutils.bonding.portlib import get_linker_ids -from ..rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports -from ..mdtools.openmmtools.serialization import serialize_openmm_pdb - - -# CONVERSION -def mbmol_from_mono_rdmol(rdmol : Chem.Mol) -> tuple[Compound, list[int]]: - '''Accepts a monomer-spec-compliant SMARTS string and returns an mbuild Compound and a list of the indices of atom ports''' - linker_ids = [i for i in get_linker_ids(rdmol)] # record indices of ports - MUST unpack generator for mbuild compatibility - - # create port-free version of molecule which RDKit can embed without errors - prot_mol = hydrogenate_rdmol_ports(rdmol, in_place=False) - # prot_mol = saturate_ports(rdmol) # TOSELF : custom, port-based saturation methods are not yet ready for deployment - yield issues in RDKit representation under-the-hood - Chem.SanitizeMol(prot_mol, sanitizeOps=SANITIZE_AS_KEKULE) # ensure Mol is valid (avoids implicitValence issues) - mb_compound = mb.conversion.from_rdkit(prot_mol) # native from_rdkit() method actually appears to preserve atom ordering - - return mb_compound, linker_ids - -@allow_string_paths -def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int=2, res_repl : dict[str, str]=None) -> None: - '''Save an MBuild Compound into an OpenMM-compatible PDB file''' - if res_repl is None: # avoid mutable default - res_repl = {'RES' : 'Pol'} - - traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more infor-rich format) - omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory - - serialize_openmm_pdb( - pdb_path, - topology=omm_top, - positions=omm_pos, - uniquify_atom_ids=True, - num_atom_id_digits=num_atom_digits, - resname_repl=res_repl - ) - - -# LINEAR POLYMER BUILDING -def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A', add_Hs : bool=False, energy_minimize : bool=False) -> MBPolymer: - '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON) - and a degree of polymerization (i.e. 
chain length in number of monomers)) and returns an mbuild Polymer object''' - # 0) VERIFY THAT CHAIN ACTUAL CAN DEFINE LINEAR POLYMER - if not monomers.is_linear: - raise MorphologyError('Linear polymer building does not support non-linear monomer input') - - if monomers.has_valid_linear_term_orient: - term_orient = monomers.term_orient - LOGGER.info(f'Using pre-defined terminal group orientation {term_orient}') - else: - term_orient = { - resname : orient - for (resname, rdmol), orient in zip(monomers.iter_rdmols(term_only=True), ['head', 'tail']) # will raise StopIteration if fewer - } - LOGGER.warning(f'No valid terminal monomer orientations defined; autogenerated orientations "{term_orient}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!') - - # 1) ADD MIDDLE MONOMERS TO CHAIN - chain = MBPolymer() - for (resname, middle_monomer), sequence_key in zip(monomers.iter_rdmols(term_only=False), sequence): # zip with sequence limits number of middle monomers to length of block sequence - LOGGER.info(f'Registering middle monomer {resname} (block identifier "{sequence_key}")') - mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer) - chain.add_monomer(compound=mb_monomer, indices=linker_ids) - - # 2) ADD TERMINAL MONOMERS TO CHAIN - term_iters = { # need to convert to iterators to allow for generator-like advancement (required for term group selection to behave as expected) - resname : iter(rdmol_list) # made necessary by annoying list-bound structure of current substructure spec - for resname, rdmol_list in monomers.rdmols(term_only=True).items() - } - for resname, head_or_tail in term_orient.items(): - LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")') - term_monomer = next(term_iters[resname]) - mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer) - chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation - - # 3) ASSEMBLE AND RETURN CHAIN - n_atoms = estimate_chain_len_linear(monomers, DOP) - LOGGER.info(f'Assembling linear polymer chain with {DOP} monomers ({n_atoms} atoms)') - chain.build(DOP - 2, sequence=sequence, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers) - for atom in chain.particles(): - atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings) - LOGGER.info(f'Successfully assembled linear polymer chain with {DOP} monomers ({n_atoms} atoms)') - - if energy_minimize: - LOGGER.info('Energy-minimizing chain to find more stabile conformer') - chain.energy_minimize() - LOGGER.info('Energy minimization completed') - - return chain \ No newline at end of file diff --git a/polymerist/polymers/building/__init__.py b/polymerist/polymers/building/__init__.py new file mode 100644 index 0000000..dfe3e3d --- /dev/null +++ b/polymerist/polymers/building/__init__.py @@ -0,0 +1,21 @@ +''' +Tools for building polymer conformers out of monomer SMARTS fragments +Currently restricted to building linear homopolymers and periodic block copolymers +''' + +from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage + +if not modules_installed('mbuild'): + raise MissingPrerequisitePackage( + importing_package_name=__spec__.name, + use_case='Polymer building', + install_link='https://mbuild.mosdef.org/en/stable/getting_started/installation/installation.html', + 
dependency_name='mbuild', + dependency_name_formal='mBuild', + ) + +from .linear import build_linear_polymer +from .mbconvert import ( + mbmol_from_mono_rdmol, mbmol_to_rdmol, + mbmol_to_openmm_pdb, mbmol_to_rdkit_pdb, +) \ No newline at end of file diff --git a/polymerist/polymers/building/linear.py b/polymerist/polymers/building/linear.py new file mode 100644 index 0000000..dab50f4 --- /dev/null +++ b/polymerist/polymers/building/linear.py @@ -0,0 +1,88 @@ +'''For generating linear polymer structure from monomer, sequence, and chain length information''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +import logging +LOGGER = logging.getLogger(__name__) + +import warnings +with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings + warnings.filterwarnings('ignore', category=DeprecationWarning) + from mbuild.lib.recipes.polymer import Polymer as MBPolymer + +from .mbconvert import mbmol_from_mono_rdmol +from .sequencing import LinearCopolymerSequencer +from ..exceptions import MorphologyError +from ..monomers.repr import MonomerGroup +from ..estimation import estimate_n_atoms_linear +from ...genutils.textual.substrings import unique_string + + +def build_linear_polymer( + monomers : MonomerGroup, + n_monomers : int, + sequence : str='A', + minimize_sequence : bool=True, + allow_partial_sequences : bool=False, + add_Hs : bool=False, + energy_minimize : bool=False, + ) -> MBPolymer: + '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON) + and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object''' + # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE) - DEV: consider making a separate function + end_groups = monomers.linear_end_groups() # cache end groups so they dont need to be recalculated when registering end groups + end_group_names = [resname for (resname, _) in end_groups.values()] + + sequencer = LinearCopolymerSequencer( + sequence_kernel=sequence, + n_repeat_units=n_monomers, + n_repeat_units_terminal=len(end_groups) + ) + if minimize_sequence: + sequencer.reduce() # identify minimal subsequences + + sequence_compliant, n_seq_repeats = sequencer.procrustean_alignment(allow_partial_sequences=allow_partial_sequences) + LOGGER.info( + f'Target chain length achievable with {sequencer.describe_tally()}, ' \ + f'namely with the sequence {sequencer.describe_order(end_group_names=end_group_names)}' + ) + sequence_unique = unique_string(sequence_compliant, preserve_order=True) # only register a new monomer for each appearance of a new, unique symbol in the sequence + + # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY + polymer = MBPolymer() + monomers_selected = MonomerGroup() # used to track and estimate sized of the monomers being used for building + + ## 2A) ADD MIDDLE MONOMERS TO CHAIN + for symbol, (resname, middle_monomer) in zip(sequence_unique, monomers.iter_rdmols(term_only=False)): # zip with sequence limits number of middle monomers to length of block sequence + LOGGER.info(f'Registering middle monomer {resname} (block identifier "{symbol}")') + mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname) + polymer.add_monomer(compound=mb_monomer, indices=linker_ids) + monomers_selected.monomers[resname] = monomers.monomers[resname] + + ## 2B) ADD TERMINAL MONOMERS TO CHAIN + for head_or_tail, (resname, term_monomer) in 
end_groups.items(): + LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")') + mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer, resname=resname) + polymer.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation + monomers_selected.monomers[resname] = monomers.monomers[resname] + + # 3) ASSEMBLE AND RETURN CHAIN + if not monomers_selected.is_linear: # verify the selected monomers actually define a linear polymer + raise MorphologyError('Linear polymer building does not support non-linear monomer input') + + n_atoms_est = estimate_n_atoms_linear(monomers_selected, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy + LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)') + + polymer.build(n_seq_repeats, sequence=sequence_compliant, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers) + for atom in polymer.particles(): + atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings) + LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {polymer.n_particles} atoms)') + + # 4) OPTIONALLY, PERFORM FINAL UFF ENERGY MINIMIZATION + if energy_minimize: + LOGGER.info('Energy-minimizing chain to find more stable conformer') + polymer.energy_minimize() + LOGGER.info('Energy minimization completed') + + return polymer \ No newline at end of file diff --git a/polymerist/polymers/building/mbconvert.py b/polymerist/polymers/building/mbconvert.py new file mode 100644 index 0000000..61656d9 --- /dev/null +++ b/polymerist/polymers/building/mbconvert.py @@ -0,0 +1,135 @@ +''' +Enhanced conversions to and from mbuild Compound objects which +preserve more molecular information than the utilities provided by +''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage + +if not modules_installed('openbabel'): + raise MissingPrerequisitePackage( + importing_package_name=__spec__.name, + use_case='Translation between chemical representations of polymers', + install_link='https://libraries.io/conda/openbabel', + dependency_name='openbabel', + dependency_name_formal='the OpenBabel chemical toolbox', + ) + +from typing import Optional +from pathlib import Path + +import warnings +with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings + warnings.filterwarnings('ignore', category=DeprecationWarning) + from mbuild import Compound + from mbuild.conversion import from_rdkit + +from rdkit import Chem + +from ...genutils.decorators.functional import allow_string_paths, allow_pathlib_paths +from ..monomers.specification import SANITIZE_AS_KEKULE +from ...molfiles.pdb import SerialAtomLabeller +from ...rdutils.bonding.portlib import get_linker_ids +from ...rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports +from ...mdtools.openmmtools.serialization import serialize_openmm_pdb + + +# Conversion from other formats to Compound +def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tuple[Compound, list[int]]: + ''' + Accepts a monomer-spec-compliant SMARTS string and returns an mbuild Compound and a list of the indices of atom ports + If "resname" is 
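A hedged usage sketch of the reworked linear builder; the keyword names come from the new `build_linear_polymer()` signature and `PE_FRAGMENTS` from the fragment catalogue added later in this diff, but whether this particular monomer group resolves suitable end groups is an assumption of the example:

```python
from polymerist.polymers.monomers.repr import MonomerGroup
from polymerist.polymers.monomers.fragments import PE_FRAGMENTS
from polymerist.polymers.building import build_linear_polymer, mbmol_to_openmm_pdb

monomers = MonomerGroup(monomers=PE_FRAGMENTS)  # 'PE1' is a capped (terminal) fragment, 'PE2' a two-port middle fragment
chain = build_linear_polymer(
    monomers,
    n_monomers=10,             # total repeat units, end groups included
    sequence='A',              # homopolymer kernel; e.g. 'AB' would alternate two middle monomers
    minimize_sequence=True,    # reduce the kernel (e.g. 'ABAB' -> 'AB') before sequence alignment
    allow_partial_sequences=False,
)
mbmol_to_openmm_pdb('PE_10mer.pdb', chain)  # hypothetical output path
```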
provided, will assign that name to the mBuild Compound returned + ''' + linker_ids = [i for i in get_linker_ids(rdmol)] # record indices of ports - MUST unpack generator for mbuild compatibility + + # create port-free version of molecule which RDKit can embed without errors + prot_mol = hydrogenate_rdmol_ports(rdmol, in_place=False) + # prot_mol = saturate_ports(rdmol) # TOSELF : custom, port-based saturation methods are not yet ready for deployment - yield issues in RDKit representation under-the-hood + Chem.SanitizeMol(prot_mol, sanitizeOps=SANITIZE_AS_KEKULE) # ensure Mol is valid (avoids implicitValence issues) + + mb_compound = from_rdkit(prot_mol) # native from_rdkit() method actually appears to preserve atom ordering + if resname is not None: + mb_compound.name = resname + + return mb_compound, linker_ids + +# Conversion from Compound to other formats +_DEFAULT_RESNAME_MAP : dict[str, str] = { # module-wide config for default PDB residue name replacements for polymers + 'RES' : 'Pol', +} + +def mbmol_to_rdmol( # TODO: deduplify PDB atom name and residue numbering code against serialize_openmm_pdb() + mbmol : Compound, + atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(), + resname_map : Optional[dict[str, str]]=None + ) -> Chem.Mol: + '''Convert an mBuild Compound into an RDKit Mol, with correct atom coordinates and PDB residue info''' + if resname_map is None: + resname_map = _DEFAULT_RESNAME_MAP + + rdmol = mbmol.to_rdkit() + conformer = Chem.Conformer() + conformer.Set3D(True) + + atom_id : int = 0 + for resnum, mb_monomer in enumerate(mbmol.children, start=1): + resname = resname_map.get(mb_monomer.name, mb_monomer.name[:3]) # if no remapping is found, just take first 3 chars + # NOTE: the order of monomers and atoms within those monomers were added in the same order as iterated over here... + #... 
so the atom indices **SHOULD** be in the correct order (hate that this even might be uncertain) + for mbatom in mb_monomer.particles(): + conformer.SetAtomPosition(atom_id, 10*mbatom.pos.astype(float)) # conveert from nm to angstrom + + # set PDB residue info if monomer hierarchy is present + if mbatom != mb_monomer: # for Compounds with a flat hierarchy, the children and particles of children will coincide + pdb_info = Chem.AtomPDBResidueInfo( + atomName=4*' ' if not atom_labeller else atom_labeller.get_atom_label(mbatom.element.symbol), + residueName=resname, + residueNumber=resnum, + chainId='1', + isHeteroAtom=True, + ) + rdmol.GetAtomWithIdx(atom_id).SetPDBResidueInfo(pdb_info) + atom_id += 1 # TODO: this is an awful waay of keeping track of atom indices, see if there's a more secure way to do this + conf_id = rdmol.AddConformer(conformer) # NOTE: recording this to self-document return values (this is intentionally not used) + + return rdmol + +# Serialization of Compounds to files +@allow_pathlib_paths +def mbmol_to_rdkit_pdb( + pdb_path : str, + mbmol : Compound, + atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(), + resname_map : Optional[dict[str, str]]=None, + ) -> None: + '''Save an MBuild Compound into an RDKit-formatted PDB file''' + Chem.MolToPDBFile( + mbmol_to_rdmol(mbmol, atom_labeller=atom_labeller, resname_map=resname_map), + pdb_path, + ) + +@allow_string_paths +def mbmol_to_openmm_pdb( + pdb_path : Path, + mbmol : Compound, + atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(), + resname_map : Optional[dict[str, str]]=None, + ) -> None: + '''Save an MBuild Compound into an OpenMM-formatted PDB file''' + if resname_map is None: # avoid mutable default + resname_map = _DEFAULT_RESNAME_MAP + + # NOTE: converting through MDTraj first before going to OpenMM preserves much + # of the necessary chemical info that is discarded when converting through other formats + traj = mbmol.to_trajectory(residues=[residue.name for residue in mbmol.children]) # extract names of repeat units + omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory + + serialize_openmm_pdb( + pdb_path, + topology=omm_top, + positions=omm_pos, + atom_labeller=atom_labeller, + resname_map=resname_map, + ) \ No newline at end of file diff --git a/polymerist/polymers/building/sequencing.py b/polymerist/polymers/building/sequencing.py new file mode 100644 index 0000000..cd52c69 --- /dev/null +++ b/polymerist/polymers/building/sequencing.py @@ -0,0 +1,199 @@ +'''For generating and manipulating sequences of symbols which correspond to monomer ordering in blocky and random copolymers''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +import logging +LOGGER = logging.getLogger(__name__) + +from typing import Iterable, Optional +from dataclasses import dataclass, field, asdict + +from ...genutils.textual.substrings import shortest_repeating_substring, repeat_string_to_length +from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable +from ..exceptions import EndGroupDominatedChain, InsufficientChainLength, EmptyBlockSequence, PartialBlockSequence + + +@make_jsonifiable +@dataclass +class LinearCopolymerSequencer: + ''' + For encapsulating information about the sequence of repeat units in a periodic, linear copolymer + Also covers, as trivial special cases, homopolymers and alternating copolymers + + Parameters + ---------- + sequence_kernel : str + A sequence indicating a periodic ordering of 
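A brief, hedged sketch of sending an assembled chain through the new `mbmol_to_rdmol()` while inspecting the per-residue PDB metadata it attaches; the `chain` object is assumed to come from a `build_linear_polymer()` call like the one sketched earlier:

```python
from polymerist.polymers.building.mbconvert import mbmol_to_rdmol

rdmol = mbmol_to_rdmol(chain, resname_map={'RES': 'Pol'})  # 'chain' built previously with build_linear_polymer()
info = rdmol.GetAtomWithIdx(0).GetPDBResidueInfo()         # populated when the Compound has monomer children
if info is not None:
    print(info.GetResidueName(), info.GetResidueNumber(), info.GetName())
```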
monomers in a linear polymer block (e.g. "A", "ABAC", etc)
+        Each unique symbol in the sequence corresponds to a distinct monomer in the block
+    n_repeat_units : int
+        The desired total number of monomers (including terminal monomers) in a polymer chain
+    n_repeat_units_terminal : int
+        The number of terminal monomers ("end groups") which are to be included in the chain
+        in addition to the middle monomers described by "sequence_kernel"
+
+    Raises
+    ------
+    EmptyBlockSequence
+        The sequence provided is empty (can't be used to define a nonzero-length chain)
+    EndGroupDominatedChain
+        The number of terminal monomers exceeds the total number of monomers
+    '''
+    sequence_kernel : str
+    n_repeat_units : int
+    n_repeat_units_terminal : int = 0
+
+    # Attribute checks and modifications
+    def __post_init__(self) -> None:
+        if not self.sequence_kernel:
+            raise EmptyBlockSequence('Must provide non-empty sequence kernel to yield a valid (co)polymer sequence')
+
+        if self.n_repeat_units_middle < 0:
+            raise EndGroupDominatedChain(
+                f'Number of terminal monomers exceeds requested chain length; ({self.n_repeat_units}-mer ' \
+                f'chain can\'t possibly contain {self.n_repeat_units_terminal} terminal monomers)'
+            )
+
+    def copy(self) -> 'LinearCopolymerSequencer':
+        '''Returns another equivalent instance of the current sequence info more efficiently than a complete deepcopy'''
+        return self.__class__(**asdict(self))
+
+    def reduce(self) -> None:
+        '''
+        Determines if there is a shorter repeating subsequence making up the current sequence kernel
+        If there is, adjusts the sequence kernel to that minimal sequence; does nothing otherwise
+
+        Reduction is idempotent, and guarantees that the smallest possible kernel is used when sequencing
+        '''
+        minimal_subsequence = shortest_repeating_substring(self.sequence_kernel)
+        kernel_period = self.block_size // len(minimal_subsequence) # account for any periodic shortening WITHIN the kernel
+
+        if kernel_period == 1:
+            LOGGER.info(f'Sequence kernel "{self.sequence_kernel}" is already fully reduced; no changes made')
+            return
+        else:
+            LOGGER.info(
+                f'Sequence kernel "{self.sequence_kernel}" can be further decomposed as {kernel_period}*"{minimal_subsequence}"; ' \
+                f'Setting kernel to minimal subsequence "{minimal_subsequence}"'
+            )
+            self.sequence_kernel = minimal_subsequence
+
+    def reduced(self) -> 'LinearCopolymerSequencer':
+        '''Return a sequence-reduced version of the current sequence info'''
+        clone = self.copy()
+        clone.reduce()
+
+        return clone
+
+    # Properties derived from sequence kernel and target chain lengths
+
+    @property
+    def n_repeat_units_middle(self) -> int:
+        '''Number of middle (i.e. 
non-terminal) repeat units''' + return self.n_repeat_units - self.n_repeat_units_terminal + + # Whole sequence periods + @property + def block_size(self) -> int: + '''Number of repeat units units in one whole iteration of the kernel block''' + return len(self.sequence_kernel) + period = block_size + + @property + def n_full_periods(self) -> int: + ''' + Largest number of complete repetitions of the sequence kernel which, when taken + together, contain no more repeats units than the specified number of middle units + ''' + return self.n_repeat_units_middle // self.block_size + + # Partial sequence residues + @property + def n_residual_repeat_units(self) -> int: + ''' + Difference between number of middle repeat units and units which + would occur in maximal full periods of the kernel + + By construction, is no greater than the block size and is + identically zero exactly when a whole number of kernel repeats + ''' + return self.n_repeat_units_middle % self.block_size + n_residual_symbols = n_res = n_residual_repeat_units + + @property + def has_residual(self) -> bool: + ''' + Whether or not the target number of middle repeat units + can be attained by a whole number of kernel repeats + ''' + return bool(self.n_residual_repeat_units) + + @property + def sequence_residual(self) -> str: + '''Partial repeat of the kernel sequence needed to attain the speficied number of middle units''' + return self.sequence_kernel[:self.n_residual_repeat_units] + residual = sequence_residual + + ## PROCRUSTEAN sequence alignment + def procrustean_alignment(self, allow_partial_sequences : bool=False) -> tuple[str, int]: + ''' + PROCRUSTEAN: Periodic Recurrence Of Cyclic Repeat Unit Sequences, Truncated to an Exact and Arbitrary Number + Stretches or truncates the sequence kernel to achieve a target sequence length, cycling through the kernel's period as many times as needed + + Algorithm produces a sequence string "P" and number of repeats "r" which, taken together, satisfy the following: + - The number of units in r repeats of P plus the number of terminal monomers is precisely equal to the target number of monomers + - The units in P cycle through the units in S, in the order they appear in S + - The number of times S is cycled through in P is always a rational multiple of the length of S + If no satisfiable sequence-count pair can be found, raises an appropriate informative exception + ''' + if not self.has_residual: # the case where the target length happens to consist of a whole-number of repeats of the kernel + if self.n_full_periods < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced my hand + raise InsufficientChainLength( + f'{self.n_repeat_units}-monomer chain cannot accomodate both {self.n_repeat_units_terminal} end groups AND at least 1 middle monomer sequence' + ) + sequence_procrustean = self.sequence_kernel + n_seq_repeats = self.n_full_periods + else: + if not allow_partial_sequences: + raise PartialBlockSequence( + f'Partial polymer block sequence required to meet target number of monomers ("{self.residual}" prefix of sequence "{self.sequence_kernel}");\n' \ + 'If this is acceptable, set "allow_partial_sequences=True" and try calling build routine again' + ) + sequence_procrustean = repeat_string_to_length(self.sequence_kernel, target_length=self.n_repeat_units_middle, joiner='') + n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit) + + return sequence_procrustean, n_seq_repeats + + def 
describe_order(self, end_group_names : Optional[Iterable[str]]=None, default_end_group_name : str='END-GROUP') -> str: + '''Descriptive string presenting a condensed view of the order of repeat units in the final sequence''' + # Assign names for end groups + if end_group_names is None: + end_group_names = [f'[{default_end_group_name}]']*self.n_repeat_units_terminal + else: + end_group_names = [f'[{end_group_name}]' for end_group_name in end_group_names] # unpack into list and enforce correct number of names + if (num_names_provided := len(end_group_names)) != self.n_repeat_units_terminal: # DEV: consider supporting filling in missing names with default in future + raise IndexError(f'Defined sequence info with {self.n_repeat_units_terminal} end groups, but only provided names for {num_names_provided}') + + # Insert middle omnomer parts as necessary + sequence_middle = [] + if self.n_full_periods != 0: ## Whole sequence strings + sequence_middle.append(f'{self.n_full_periods}*[{self.sequence_kernel}]') + if self.has_residual: ## Partial sequence strings + sequence_middle.append(f'[{self.residual}]') + + # Abut with correct amount of end group indicators + sequence_parts = end_group_names[:] + sequence_parts[1:-1] = sequence_middle + + return ' + '.join(sequence_parts) + + def describe_tally(self) -> str: + '''Descriptive string indicating how all parts of the overall sequence contribute to the target number of repeat units''' + desc_seq_counts_parts = [] + if self.n_full_periods != 0: ## Whole sequence strings + desc_seq_counts_parts.append(f'{self.n_full_periods} whole {self.block_size}-sequence repeat(s)') + if self.has_residual: ## Partial sequence strings + desc_seq_counts_parts.append(f'a partial {self.n_residual_repeat_units}/{self.block_size} sequence repeat') + + return ' and '.join(desc_seq_counts_parts) + \ No newline at end of file diff --git a/polymerist/polymers/estimation.py b/polymerist/polymers/estimation.py index fa56799..b075888 100644 --- a/polymerist/polymers/estimation.py +++ b/polymerist/polymers/estimation.py @@ -4,18 +4,16 @@ __email__ = 'timotej.bernat@colorado.edu' import numpy as np -from rdkit import Chem -from .exceptions import InsufficientChainLengthError -from ..genutils.iteration import iter_len +from .exceptions import InsufficientChainLength from ..polymers.monomers.repr import MonomerGroup from ..rdutils.bonding.portlib import get_num_ports -def estimate_chain_len_linear(monomers : MonomerGroup, DOP : int) -> int: +def estimate_n_atoms_linear(monomers : MonomerGroup, n_monomers : int) -> int: '''Given a set of monomers and the desired degree of polymerization, estimate the length of the resulting chain !NOTE! : As-implemented, only works for linear homopolymers and block copolymers with equal an distribution of monomers''' - # TOSELF : omitted logging for now, as it gets repeated on EVERY cycle in when called estimate_DOP_lower + # TOSELF : omitted logging for now, as it gets repeated on EVERY cycle in when called estimate_n_monomers_supremum() num_mono = monomers.n_monomers mono_term = np.zeros(num_mono, dtype=bool) # terminality of each monomer (i.e. whether or not it is a term group) mono_multip = np.zeros(num_mono, dtype=int) # multiplicity of each polymer (i.e. 
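A worked example of the PROCRUSTEAN alignment described above (not part of the diff; the numbers follow directly from the class definition):

```python
from polymerist.polymers.building.sequencing import LinearCopolymerSequencer

seq = LinearCopolymerSequencer(sequence_kernel='AB', n_repeat_units=12, n_repeat_units_terminal=2)
print(seq.procrustean_alignment())
# -> ('AB', 5): 2 end groups + 5 whole repeats of "AB" = 12 total repeat units

seq = LinearCopolymerSequencer(sequence_kernel='AB', n_repeat_units=13, n_repeat_units_terminal=2)
print(seq.procrustean_alignment(allow_partial_sequences=True))
# -> ('ABABABABABA', 1): the 11 middle units are built as a single, partial cycle of the kernel
# with allow_partial_sequences=False, this second call raises PartialBlockSequence instead
```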
how many times is occurs in a chain) @@ -32,27 +30,30 @@ def estimate_chain_len_linear(monomers : MonomerGroup, DOP : int) -> int: num_term = sum(mono_term) num_mid = num_mono - num_term # assumed that all monomers are either terminal or not - mono_multip[~mono_term] = (DOP - num_term) / num_mid # naive assumption that all middle monomers contribute rest of chain equally (for homopolymers, this is always true) + mono_multip[~mono_term] = (n_monomers - num_term) / num_mid # naive assumption that all middle monomers contribute rest of chain equally (for homopolymers, this is always true) N = mono_contrib @ mono_multip # compute dot product to yield final count return N -def estimate_DOP_lower(monomers : MonomerGroup, max_chain_len : int, min_DOP : int=3) -> int: - '''Returns the largest DOP for a set of monomers which yields a chain no longer than the specified chain length''' - base_chain_len = estimate_chain_len_linear(monomers, min_DOP) - if base_chain_len > max_chain_len: # pre-check when optimization is impossible - raise InsufficientChainLengthError(f'Even shortest possible chain (DOP={min_DOP}, N={base_chain_len}) is longer than the specified max length of {max_chain_len} atoms') - - DOP = min_DOP - while estimate_chain_len_linear(monomers, DOP + 1) < max_chain_len: # check if adding 1 more monomer keeps the length below the threshold - DOP += 1 - - return DOP - -def estimate_DOP_upper(monomers : MonomerGroup, min_chain_len : int, min_DOP : int=3) -> int: # NOTE : as currently defined, this also subsumes the case when the estimate and calculated length are exactly equal - '''Returns the smallest DOP for a set of monomers which yields a chain no shorter than the specified chain length''' - return estimate_DOP_lower(monomers, min_chain_len, min_DOP=min_DOP) + 1 # by definition, this is just 1 monomer longer than the lower bound - -estimate_DOP_infimum = estimate_DOP_upper # more descriptive aliases to alleviate confusion (originals kept in for backwards compatibility) -estimate_DOP_supremum = estimate_DOP_lower # more descriptive aliases to alleviate confusion (originals kept in for backwards compatibility) \ No newline at end of file +def estimate_n_monomers_infimum(monomers : MonomerGroup, n_atoms_max : int, n_monomers_min : int=3) -> int: + ''' + For a given collection of monomer fragments, returns the largest number of monomers which guarantees that + a polymer chain made up of those monomers will have no more than the specified maximum number of atoms + ''' + n_atoms_base = estimate_n_atoms_linear(monomers, n_monomers_min) + if n_atoms_base > n_atoms_max: # pre-check when optimization is impossible + raise InsufficientChainLength(f'Even shortest possible chain ({n_monomers_min} monomers, with {n_atoms_base} atoms) is longer than the specified max length of {n_atoms_max} atoms') + + n_monomers = n_monomers_min + while estimate_n_atoms_linear(monomers, n_monomers + 1) < n_atoms_max: # check if adding 1 more monomer keeps the length below the threshold + n_monomers += 1 + + return n_monomers + +def estimate_n_monomers_supremum(monomers : MonomerGroup, n_atoms_min : int, n_monomers_min : int=3) -> int: # NOTE : as currently defined, this also subsumes the case when the estimate and calculated length are exactly equal + ''' + For a given collection of monomer fragments, returns the smallest number of monomers which guarantees that + a polymer chain made up of those monomers will have no fewer than the specified minimum number of atoms + ''' + return estimate_n_monomers_infimum(monomers, 
n_atoms_min, n_monomers_min=n_monomers_min) + 1 # by definition, any more monomers than the infimum guarantees the chain will surpass a given number of atoms
\ No newline at end of file
diff --git a/polymerist/polymers/exceptions.py b/polymerist/polymers/exceptions.py
index e7b5625..e0060f9 100644
--- a/polymerist/polymers/exceptions.py
+++ b/polymerist/polymers/exceptions.py
@@ -3,51 +3,44 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
-
-class SubstructMatchFailedError(Exception):
-    '''Raised when molecule graph isomorphism match does not form a cover'''
-    pass
-
-class InsufficientChainLengthError(Exception):
+# CHAIN LENGTH AND SHAPE ERRORS
+class InsufficientChainLength(Exception):
     '''Raised when the polymer molecule being built is too short'''
     pass
 
-class ExcessiveChainLengthError(Exception):
+class ExcessiveChainLength(Exception):
     '''Raised when the polymer molecule being built is too long'''
     pass
 
+class EndGroupDominatedChain(Exception):
+    '''Raised to indicate that a chain contains more end groups than its total number of monomers could possibly allow'''
+
 class MorphologyError(Exception):
     '''Raised when a polymer does not have the morphology (i.e. crosslinking, molecular weight, etc) an application expects'''
     pass
 
-class AlreadySolvatedError(Exception):
-    '''Raised when attempting to add solvent to a molecule which already has solvent'''
+# COPOLYMER SEQUENCING ERRORS
+class EmptyBlockSequence(Exception):
+    '''Raised when a trivial copolymer block sequence (i.e. the empty string "") is provided when not expected'''
     pass
 
-class ChargeMismatchError(Exception):
-    '''Raised when attempting to merge two objects which disagree on their charging status'''
+class PartialBlockSequence(Exception):
+    '''Raised when a non-whole number of copolymer blocks is needed to reach a target chain length (and is not allowed)'''
     pass
 
-class NoSimulationsFoundError(Exception):
-    '''Raised when attempting to load a simulation for a managed molecule when none are present'''
+# POLYMERIZATION MISINFORMATION ERRORS
+class AlreadySolvated(Exception):
+    '''Raised when attempting to add solvent to a molecule which already has solvent'''
     pass
 
-class MissingStructureData(Exception):
-    '''Raised when a managed molecule has no associated structure file (e.g. PDB, SDF, etc.)'''
+class ChargeMismatch(Exception):
+    '''Raised when attempting to merge two objects which disagree on their charging status'''
     pass
 
-class MissingForceFieldData(Exception):
-    '''Raised when a forcefield is unspecified for a Simulation or Interchange'''
+class MissingStructureData(Exception):
+    '''Raised when a managed molecule has no associated structure file (e.g. 
PDB, SDF, etc.)''' pass class MissingMonomerData(Exception): - '''Raised when no monomer information is found for a Polymer''' - pass - -class MissingMonomerDataUncharged(MissingMonomerData): - '''Raised when no monomer information WITHOUT library charges is found for a Polymer''' - pass - -class MissingMonomerDataCharged(MissingMonomerData): - '''Raised when no monomer information WITH library charges is found for a Polymer''' + '''Raised when no monomer fragment information is found for a Polymer''' pass diff --git a/polymerist/polymers/monographs.py b/polymerist/polymers/monographs.py index 28f6fcd..8c9b591 100644 --- a/polymerist/polymers/monographs.py +++ b/polymerist/polymers/monographs.py @@ -38,7 +38,6 @@ def get_flavor_dict_at_node_index(self, node_idx : int) -> Optional[dict[int, in def num_monomers(self) -> int: '''Number of monomer units represented in the current polymer''' return self.number_of_nodes() - DOP = num_monomers @property def is_unbranched(self) -> bool: diff --git a/polymerist/polymers/monomers/__init__.py b/polymerist/polymers/monomers/__init__.py index 176a2ff..7e50795 100644 --- a/polymerist/polymers/monomers/__init__.py +++ b/polymerist/polymers/monomers/__init__.py @@ -3,4 +3,9 @@ __author__ = 'Timotej Bernat' __email__ = 'timotej.bernat@colorado.edu' -from .repr import MonomerGroup # make monomer representation available at the module level \ No newline at end of file +from .repr import MonomerGroup # make monomer representation available at the module level +from .fragments import ( + PE_FRAGMENTS, + MPD_TMC_FRAGMENTS, + PEG_PLGA_FRAGMENTS, +) \ No newline at end of file diff --git a/polymerist/polymers/monomers/fragments.py b/polymerist/polymers/monomers/fragments.py new file mode 100644 index 0000000..a734e80 --- /dev/null +++ b/polymerist/polymers/monomers/fragments.py @@ -0,0 +1,32 @@ +'''Catalogue of monomer fragment templates for some common polymer systems''' + +PE_FRAGMENTS : dict[str, list[str]] = { + # PE (polyethylene) + 'PE1': ['[*:1]-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:6])(-[#1D1+0:7])-[#1D1+0:8])(-[#1D1+0:4])-[#1D1+0:5]'], + 'PE2': ['[*:1]-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:7])-[#1D1+0:8])(-[#1D1+0:5])-[#1D1+0:6]'], +} + +MPD_TMC_FRAGMENTS : dict[str, list[str]] = { # fragments for common polyamide membrane + # MPD (m-phenyl diamine) + 'MPD-1': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:11])-[#6D3+0:5](-[#1D1+0:12])=[#6D3+0:6](-[#1D1+0:13])-[#6D3+0:7](-[#1D1+0:14])=[#6D3+0:8]-1-[#7D3+0:9](-[#1D1+0:15])-[#1D1+0:16])-[#1D1+0:10]'], + 'MPD-2': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:12])-[#6D3+0:5](-[#1D1+0:13])=[#6D3+0:6](-[#1D1+0:14])-[#6D3+0:7](-[#1D1+0:15])=[#6D3+0:8]-1-[#7D3+0:9](-[*:10])-[#1D1+0:16])-[#1D1+0:11]'], + # TMC (trimesoyl chloride) + 'TMC-1': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[#17D1+0:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'], + 'TMC-2': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'], + 'TMC-3': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'], +} + +PEG_PLGA_FRAGMENTS : dict[str, list[str]] = { # fragments for all variants of 
PEG-PLGA-like polymers + # PEG (ethylene glycol) + 'PEG-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7])-[#1D1+0:5]'], + 'PEG-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#8D2+0:5]-[#1D1+0:10])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'], + 'PEG-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[*:5])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'], + # PLA (lactic acid) + 'PLA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:4](=[#8D1+0:5])-[*:6])-[#1D1+0:8])-[#1D1+0:7]'], + 'PLA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[#8D2+0:7]-[#1D1+0:12])-[#1D1+0:8]'], + 'PLA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[*:7])-[#1D1+0:8]'], + # PGA (glycolic acid) + 'PGA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]'], + 'PGA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]'], + 'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'], +} \ No newline at end of file diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py index f8ebf62..0aa0d8d 100644 --- a/polymerist/polymers/monomers/repr.py +++ b/polymerist/polymers/monomers/repr.py @@ -3,40 +3,100 @@ __author__ = 'Timotej Bernat' __email__ = 'timotej.bernat@colorado.edu' -from typing import Generator, Optional, TypeAlias, Union +import logging +LOGGER = logging.getLogger(__name__) + +from typing import Generator, Optional, Iterable, Union from dataclasses import dataclass, field +from itertools import cycle from collections import defaultdict + from rdkit import Chem -from rdkit.Chem.rdchem import Mol from ...genutils.iteration import iter_len from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable -from ...rdutils.bonding.portlib import get_num_ports +from ...smileslib.primitives import Smarts, is_valid_SMARTS +from ...rdutils.bonding.portlib import get_num_ports -ResidueSmarts : TypeAlias = dict[str, list[str]] # monomer SMARTS strings keyed by residue name # MAIN REPRESENTATION CLASS @make_jsonifiable @dataclass class MonomerGroup: '''Stores collections of residue-labelled monomer SMARTS''' - monomers : ResidueSmarts = field(default_factory=dict) - term_orient : dict[str, str] = field(default_factory=dict) - + monomers : dict[str, Union[Smarts, list[Smarts]]] = field(default_factory=dict) + term_orient : dict[str, str] = field(default_factory=dict) # keys are either "head" or "tail", values are the names of residues in "monomers" + + # MONOMER ADDITION AND VALIDATION + def __post_init__(self) -> None: + # Encase bare SMARTS into lists and check that all monomer SMARTS are valid + monomers_init = self.monomers # store inputted values + self.monomers = {} # clear monomers and re-add one-at-a-time + for resname, smarts in monomers_init.items(): + self.add_monomer(resname, smarts) + # DEV: opted to forgo term_orient check for now, as modifying this violates the read-only data model aimed for here + + def _add_monomer(self, resname : str, smarts : Smarts) -> None: + '''Add a new monomer to the templates already stored within, subject to validation checks''' + if not isinstance(smarts, str): + raise TypeError(f'Values of monomers must be either SMARTS strings or lists of SMARTS strings, not "{type(smarts).__name__}"') + # 
DEV: include check for empty string? (technically still a valid SMARTS string, but a pretty pathological one at that) + if not is_valid_SMARTS(smarts): + raise ValueError(f'Provided invalid monomer SMARTS string for {resname}: "{smarts}"') + # DEV: decide whether or not SMILES expansion and spec-compliance should be enforced here or shunted off to the user + + if resname in self.monomers: + existing_resgroup = self.monomers[resname] + if isinstance(existing_resgroup, list) and (smarts not in existing_resgroup): + LOGGER.info(f'Extending existing residue category "{resname}" with SMARTS {smarts}') + self.monomers[resname].append(smarts) + else: + LOGGER.info(f'Creating new residue category "{resname}", containing singular SMARTS ["{smarts}"]') + self.monomers[resname] = [smarts] + + def _add_monomers(self, resname : str, smarts_container : Iterable[Smarts]) -> None: + '''Add new monomers to the templates already stored within, subject to validation checks, from an iterable container''' + for smarts in smarts_container: + self._add_monomer(resname, smarts) + + def add_monomer(self, resname : str, smarts : Union[Smarts, Iterable[Smarts]]) -> None: + '''Register new monomers, either directly from SMARTS or from a container of SMARTS''' + if isinstance(smarts, Iterable) and not isinstance(smarts, str): # don't want to insert one character at a time if a string is in fact provided + self._add_monomers(resname, smarts) + else: + self._add_monomer(resname, smarts) # assume any other inputs are singular values or strings + + # DUNDER "MAGIC" METHODS + def __getitem__(self, resname : str) -> list[Smarts]: + '''Convenience method to access .monomers directly from instance''' + return self.monomers[resname] # NOTE: deliberately avoid "get()" here to propagate KeyError + # BUG: user can directly append to the returned value to forgo monomer validation checks; + # this is not unique to __getitem__ but rather a consequence of thinly-wrapping builtin types + + def __setitem__(self, resname : str, smarts : Smarts) -> None: + '''Convenience method to access .monomers directly from instance''' + self.add_monomer(resname, smarts) + + def __hash__(self) -> int: + '''Hash based on monomer SMARTS and terminal orientation in a canonical order''' + # TOSELF: this is far from bulletproof, viz. 
canonicalization of SMARTS, list value sorting, etc + return hash(f'{sorted(self.monomers.items())}{sorted(self.term_orient.items())}') + + # ATTRIBUTE PROPERTIES AND ALIASES @staticmethod - def is_terminal(monomer : Mol) -> bool: + def is_terminal(monomer : Chem.Mol) -> bool: '''Determine whether or not a monomer is terminal''' return get_num_ports(monomer) == 1 - - # ATTRIBUTE PROPERTIES AND ALIASES + @property - def SMARTS(self) -> ResidueSmarts: + def SMARTS(self) -> dict[str, list[Smarts]]: '''Alias of legacy "monomers" attribute''' return self.monomers # alias of legacy name for convenience - def iter_rdmols(self, term_only : Optional[bool]=None) -> Generator[tuple[str, Mol], None, None]: + # ITERATION OVER STORED MOLECULE FRAGMENTS + def iter_rdmols(self, term_only : Optional[bool]=None) -> Generator[tuple[str, Chem.Mol], None, None]: ''' Generate (residue name, RDKit Mol) pairs of all monomers present Simplifies iteration over internal lists of monomer Mols @@ -52,7 +112,7 @@ def iter_rdmols(self, term_only : Optional[bool]=None) -> Generator[tuple[str, M if (term_only is None) or (MonomerGroup.is_terminal(monomer) == term_only): yield (resname, monomer) - def rdmols(self, term_only : Optional[bool]=None) -> dict[str, list[Mol]]: + def rdmols(self, term_only : Optional[bool]=None) -> dict[str, list[Chem.Mol]]: ''' Returns dict of RDKit Mol lists keyed by residue name @@ -67,44 +127,66 @@ def rdmols(self, term_only : Optional[bool]=None) -> dict[str, list[Mol]]: return rdmol_dict + def contributions(self, term_only : Optional[bool]=None) -> dict[str, list[int]]: + '''Returns dict of the number of real (i.e. non-linker) atoms in each residue list''' + return { + resname : [mol.GetNumAtoms() - get_num_ports(mol) for mol in mol_list] + for resname, mol_list in self.rdmols(term_only=term_only).items() + } + @property def n_monomers(self) -> int: - '''Returns number of present monomers - Multiple monomers with the same residue name are considered distinct''' + '''Returns number of present monomers; multiple monomers under the same residue name are considered distinct''' return iter_len(self.iter_rdmols(term_only=None)) - # VALIDATION AND PROPERTY CHECKS - @property - def _is_valid(self) -> bool: - '''Check that types and formatting are correct''' - for resname, SMARTS_list in self.monomers.items(): - if not (isinstance(resname, str) and isinstance(SMARTS_list, list)): - return False - else: - return True # valid only if none of the SMARTS lists fail + # END GROUP DETERMINATION + def linear_end_groups(self) -> dict[str, tuple[str, Chem.Mol]]: + ''' + Returns head-and-tail end group residue names and Mol objects as defined by term_orient - @property - def has_valid_linear_term_orient(self) -> bool: - '''Check whether terminal group orientations are sufficient to define a linear polymer''' - return ( - bool(self.term_orient) # check that: 1) the term group orientations are non-empty / non-null... - and all(resname in self.monomers for resname in self.term_orient.keys()) # 2) all term group keys match a present monomer... 
- and sorted(self.term_orient.values()) == ['head', 'tail'] # 3) orientation labels are only "head" and "tail" (in either order) - ) + If term_orient is undefined, will automatically take the first + <= 2 terminal groups available to be the end groups + + Returns + ------- + end_groups : dict[str, tuple[str, Chem.Mol]] + A dict whose keys are any of {'head', 'tail'} and whose + values are 2-tuples of residue names and Mols for the corresponding monomer + ''' + if self.term_orient and set(self.term_orient.keys()) == {'head', 'tail'}: + LOGGER.info(f'Using user-defined terminal group orientation {self.term_orient}') + monomer_iters = { + resname : cycle(smarts_list) + for resname, smarts_list in self.rdmols(term_only=True).items() + } # cycle handles degenerate end group case correctly + + return { + head_or_tail : (resname, next(monomer_iters[resname])) # will raise KeyError if any of the resnames are not present + for head_or_tail, resname in self.term_orient.items() + } + else: + term_orient_auto : dict[str, Smarts] = {} + end_groups_auto : dict[str, tuple[str, Chem.Mol]] = {} + for head_or_tail, (resname, rdmol) in zip(['head', 'tail'], self.iter_rdmols(term_only=True)): # zip will bottom out early if fewer than 2 terminal monomers are present + term_orient_auto[head_or_tail] = resname # populate purely for logging + end_groups_auto[head_or_tail] = (resname, rdmol) + LOGGER.warning(f'No valid terminal monomer orientations defined, auto-assigned orientations "{term_orient_auto}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!') + + return end_groups_auto - # COMPOSITION AND I/O METHODS + # COMPOSITION METHODS def __add__(self, other : 'MonomerGroup') -> 'MonomerGroup': '''Content-aware method of merging multiple sets of monomer info via the addition operator''' cls = self.__class__ if not isinstance(other, cls): raise NotImplementedError(f'Can only merge {cls.__name__} with another {cls.__name__}, not object of type {type(other)}') - + # TODO: figure out how to handle combination of term group orientation gracefully (ignoring for now) return MonomerGroup(monomers={**self.monomers, **other.monomers}) __radd__ = __add__ # support reverse addition # CHEMICAL INFORMATION - def unique(self, cap_group : Union[str, Mol]=Chem.MolFromSmarts('[H]-[*]')) -> 'MonomerGroup': + def unique(self, cap_group : Union[Smarts, Chem.Mol]=Chem.MolFromSmarts('[H]-[*]')) -> 'MonomerGroup': '''Return a MonomerGroup containing only the unique monomers present, given a particular port saturating group (by default just a hydrogen)''' raise NotImplementedError # unique_mono = set() diff --git a/polymerist/polymers/monomers/specification.py b/polymerist/polymers/monomers/specification.py index 0731105..8561dd4 100644 --- a/polymerist/polymers/monomers/specification.py +++ b/polymerist/polymers/monomers/specification.py @@ -98,6 +98,7 @@ def compliant_atom_query_from_re_match(match : re.Match) -> str: # CONVERSION METHODS +## DEV: add function to check whether a given SMARTS is COMPLETELY spec-compliant def compliant_mol_SMARTS(smarts : str) -> str: '''Convert generic SMARTS string into a spec-compliant one''' # initial checks diff --git a/polymerist/smileslib/__init__.py b/polymerist/smileslib/__init__.py index 647f88b..649e161 100644 --- a/polymerist/smileslib/__init__.py +++ b/polymerist/smileslib/__init__.py @@ -3,4 +3,4 @@ __author__ = 'Timotej Bernat' __email__ = 'timotej.bernat@colorado.edu' -from .primitives import is_valid_SMILES, is_valid_SMARTS \ No newline at end of file +from .primitives import 
is_valid_SMILES, is_valid_SMARTS, Smiles, Smarts \ No newline at end of file diff --git a/polymerist/smileslib/primitives.py b/polymerist/smileslib/primitives.py index 0134f3c..91c87a6 100644 --- a/polymerist/smileslib/primitives.py +++ b/polymerist/smileslib/primitives.py @@ -3,10 +3,24 @@ __author__ = 'Timotej Bernat' __email__ = 'timotej.bernat@colorado.edu' +from typing import TypeAlias + from rdkit import Chem from rdkit.Chem.rdchem import BondType +# VALIDATION +Smiles : TypeAlias = str # purely for improving self-documentation of functions, no benefit to static type-checkers +Smarts : TypeAlias = str # purely for improving self-documentation of functions, no benefit to static type-checkers + +def is_valid_SMARTS(smarts : Smarts) -> bool: + '''Check if SMARTS string is valid (according to RDKit)''' + return (Chem.MolFromSmarts(smarts) is not None) + +def is_valid_SMILES(smiles : Smiles) -> bool: + '''Check if SMILES string is valid (according to RDKit)''' + return (Chem.MolFromSmiles(smiles) is not None) + # BOND PRIMITIVES AND RELATED OBJECTS BOND_PRIMITIVES = '~-=#$:' BOND_PRIMITIVES_FOR_REGEX = r'[~\-=#$:]' # any of the SMARTS bond primitive chars, with a space to differentiate single-bond hyphen for the regex range char @@ -36,13 +50,3 @@ bonds_by_order[order] = prim_str rdbonds_by_type[bondtype] = rd_bond rdbonds_by_order[order] = rd_bond - - -# VALIDATION -def is_valid_SMARTS(smarts : str) -> bool: - '''Check if SMARTS string is valid (according to RDKit)''' - return (Chem.MolFromSmarts(smarts) is not None) - -def is_valid_SMILES(smiles : str) -> bool: - '''Check if SMARTS string is valid (according to RDKit)''' - return (Chem.MolFromSmiles(smiles) is not None) \ No newline at end of file diff --git a/polymerist/tests/genutils/textual/test_substrings.py b/polymerist/tests/genutils/textual/test_substrings.py new file mode 100644 index 0000000..1130812 --- /dev/null +++ b/polymerist/tests/genutils/textual/test_substrings.py @@ -0,0 +1,66 @@ +'''Unit tests for `substrings` package''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +import pytest + +from polymerist.genutils.textual.substrings import unique_string, shortest_repeating_substring, repeat_string_to_length + + +@pytest.mark.parametrize('string', ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit']) +def test_unique_str_unordered(string : str) -> None: + '''Test that unique characters are correctly identified WITHOUT respect to order''' + assert set(unique_string(string, preserve_order=False)) == set(string) + +@pytest.mark.parametrize('string, expected_output', + [ + ('aaaaa', 'a'), + ('BABAB', 'BA'), + ('balaclava', 'balcv'), + ('catamaran', 'catmrn'), + ('unique', 'uniqe'), # self-reference makes everything better :P + ('singular', 'singular'), # test that strings with already-unique characters are unaffected + ] +) +def test_unique_str_ordered(string : str, expected_output : str) -> None: + '''Test that unique characters are correctly identified WITH respect to order''' + assert unique_string(string, preserve_order=True) == expected_output + + +@pytest.mark.parametrize('string, expected_output', + [ + ('aaaaa', 'a'), + ('booboo', 'boo'), + ('piripiri', 'piri'), + ('ababab', 'ab'), + ('bcbabcbabcba', 'bcba'), + # sequences which do not repeat a whole-number of times + ('no repeats', 'no repeats'), + ('ababa', 'ababa'), + ('bonobo', 'bonobo'), + ] +) +def test_shortest_repeating_substring(string : str, expected_output : str) -> None: + '''Test that minimal repeating 
substrings are correctly identified''' + assert shortest_repeating_substring(string) == expected_output + + +@pytest.mark.parametrize('string, target_length, joiner, expected_output', + [ + ('BACA', 10, '', 'BACABACABA'), # expected "standard" use case + ('BACA', 1, '', 'B'), # test case where target length is shorter than the whole string + ('BACA', 0, '', ''), # test that no repeats yields the empty string + ('BACA', 4, '', 'BACA'), # test precisely one repeat without joins + ('BACA', 10, '|', 'BACA|BACA|BA'), # test joiners + ('BACA', 4, '|', 'BACA'), # test no joiners are added when exactly one string repeat occurs + ('BACA', 12, '|', 'BACA|BACA|BACA'), # test no extraneous joiners are included for purely-whole number of repeats + ('CAT', 5, '', 'CATCA'), # test with triads (and different base string) + pytest.param('' , 7, '', None, marks=pytest.mark.xfail(raises=ValueError, reason='Empty string can\'t be repeated into nonempty string', strict=True)), + pytest.param('CAT', 4.2, '', None, marks=pytest.mark.xfail(raises=TypeError , reason='Non-integer string length doesn\'t make sense', strict=True)), + pytest.param('CAT', -1, '', None, marks=pytest.mark.xfail(raises=IndexError, reason='Can\'t have string with fewer than 0 characters', strict=True)), + ] +) +def test_repeat_string_to_length(string : str, target_length : int, joiner : str, expected_output : str) -> None: + '''Test that string repetition to a given length returns the expected string WITH joining characters present''' + assert repeat_string_to_length(string, target_length=target_length, joiner=joiner) == expected_output \ No newline at end of file diff --git a/polymerist/tests/molfiles/__init__.py b/polymerist/tests/molfiles/__init__.py new file mode 100644 index 0000000..d92450c --- /dev/null +++ b/polymerist/tests/molfiles/__init__.py @@ -0,0 +1,4 @@ +'''Unit tests for `molfiles` package''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' diff --git a/polymerist/tests/molfiles/test_pdb.py b/polymerist/tests/molfiles/test_pdb.py new file mode 100644 index 0000000..9be9f3a --- /dev/null +++ b/polymerist/tests/molfiles/test_pdb.py @@ -0,0 +1,62 @@ +'''Unit tests for PDB file I/O utils''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +import pytest +from polymerist.molfiles.pdb import SerialAtomLabeller + + +ELEMS : tuple[str] = ('C', 'H', 'H', 'H', 'N', 'H', 'C', 'O', 'Cl') # atoms for methylcarbamoyl chloride (MCC) + +@pytest.mark.parametrize( + 'mol_atom_elems, atom_label_width, include_elem_idx, default_elem_idx, expected_labels', + [ + (ELEMS, 4, True, 0, ['C000', 'H000', 'H001', 'H002', 'N000', 'H003', 'C001', 'O000', 'Cl00']), # test with default PDB-compatible settings + (ELEMS, 4, True, 2, ['C002', 'H002', 'H003', 'H004', 'N002', 'H005', 'C003', 'O002', 'Cl02']), # test element index offset + (ELEMS, 3, True, 2, ['C02', 'H02', 'H03', 'H04', 'N02', 'H05', 'C03', 'O02', 'Cl2']), # test shorter atom label width + (ELEMS, 1, True, 0, ['C', 'H', 'H', 'H', 'N', 'H', 'C', 'O', 'C']), # test truncation works below threshold where indices can be written + (ELEMS, 4, False, 0, ['C ', 'H ', 'H ', 'H ', 'N ', 'H ', 'C ', 'O ', 'Cl ']), # test without element indices + (ELEMS, 4, False, 7, ['C ', 'H ', 'H ', 'H ', 'N ', 'H ', 'C ', 'O ', 'Cl ']), # test that default indices have no impact when indices aren't present + (ELEMS, 0, False, 0, ['', '', '', '', '', '', '', '', '']), # test null-width labels + # Invalid input handling checks + pytest.param( + ELEMS, -1, True, 0, 
[], # test that negative label width is rejected as intended + marks=pytest.mark.xfail( + raises=ValueError, + reason='Negative atom label widths not allowed', + strict=True, + ) + ), + pytest.param( + ELEMS, 4, True, -5, [], # test that negative default indices are rejected as intended + marks=pytest.mark.xfail( + raises=ValueError, + reason='Negative element indices not allowed', + strict=True, + ) + ), + pytest.param( + tuple(len(elem) for elem in ELEMS), 4, True, 0, [], # test that non-string atom elements are rejected as intended + marks=pytest.mark.xfail( + raises=TypeError, + reason='Must pass atom elements as strings', + strict=True, + ) + ), + ] +) +def test_atom_labeller( + mol_atom_elems : tuple[str], + atom_label_width : int, + include_elem_idx : bool, + default_elem_idx : int, + expected_labels : list[str], + ) -> None: + '''Test that atom labelling behaves as expected with various label formatting configurations''' + labeller = SerialAtomLabeller( + atom_label_width=atom_label_width, + include_elem_idx=include_elem_idx, + default_elem_idx=default_elem_idx, + ) + assert [labeller.get_atom_label(elem) for elem in mol_atom_elems] == expected_labels \ No newline at end of file diff --git a/polymerist/tests/polymers/building/__init__.py b/polymerist/tests/polymers/building/__init__.py new file mode 100644 index 0000000..f4f43b4 --- /dev/null +++ b/polymerist/tests/polymers/building/__init__.py @@ -0,0 +1,4 @@ +'''Unit tests for `building` package''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' \ No newline at end of file diff --git a/polymerist/tests/polymers/building/test_linear.py b/polymerist/tests/polymers/building/test_linear.py new file mode 100644 index 0000000..2c086a9 --- /dev/null +++ b/polymerist/tests/polymers/building/test_linear.py @@ -0,0 +1,135 @@ +'''Tests construction of structures for linear copolymers (and relevant subfamilies, e.g. 
homopolymers)''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +import pytest + +from collections import Counter + +from polymerist.polymers.monomers.repr import MonomerGroup +from polymerist.polymers.monomers.fragments import PE_FRAGMENTS, MPD_TMC_FRAGMENTS, PEG_PLGA_FRAGMENTS + +from polymerist.polymers.building import build_linear_polymer +from polymerist.polymers.exceptions import MorphologyError, PartialBlockSequence, EmptyBlockSequence + + +@pytest.fixture(scope='function') +def monogrp_polyethylene() -> MonomerGroup: + return MonomerGroup(monomers=PE_FRAGMENTS) + +@pytest.fixture(scope='function') +def monogrp_mpd_tmc() -> MonomerGroup: + return MonomerGroup(monomers=MPD_TMC_FRAGMENTS) + +@pytest.fixture(scope='function') +def monogrp_peg_plga() -> MonomerGroup: + return MonomerGroup(monomers=PEG_PLGA_FRAGMENTS) + + +@pytest.mark.parametrize( + 'monomers, term_orient, n_monomers, sequence, minimize_sequence, allow_partial_sequences, energy_minimize', + [ + # Polyethylene + ('monogrp_polyethylene', {}, 7, 'A', True, True, False), # test end group autogen (should only have 1 term group) + ('monogrp_polyethylene', {'head':'PE1', 'tail' : 'PE1'}, 7, 'A', True, True, False), # test explicit head-tail (result here should be different from autogen structure) + ('monogrp_polyethylene', {'head':'PE1', 'tail' : 'PE1'}, 7, 'A', True, False, False), # test partial sequences (irrelevant here) + ('monogrp_polyethylene', {'head':'PE1', 'tail' : 'PE1'}, 7, 'A', True, False, True), # test energy minimization doesn't crash + pytest.param( # will fail due to the empty sequence provided - + 'monogrp_polyethylene', {}, 7, '', True, True, False, # NOTE: need to have partials enabled, since failure happens ONLY once sequence is passed to mbuild + marks=pytest.mark.xfail( + raises=EmptyBlockSequence, + reason='Sequence provided must be nonempty', + strict=True, + ) + ), + pytest.param( # will fail due to too few monomers for given sequence - + 'monogrp_polyethylene', {'head':'PE1', 'tail' : 'PE1'}, 7, 'AB', True, True, False, # NOTE: need to have partials enabled, since failure happens ONLY once sequence is passed to mbuild + marks=pytest.mark.xfail( + raises=ValueError, + reason='Fewer unique monomers defined than called for by target sequence', + strict=True, + ) + ), + # MPD-TMC + ('monogrp_mpd_tmc', {'head':'MPD-1', 'tail' : 'TMC-1'}, 8, 'A', True, True, False), # correctly-specified: explicit end groups, only linear middle monomers, and whole number of sequence repeats + pytest.param( + 'monogrp_mpd_tmc', {'head':'MPD-1', 'tail' : 'TMC-1'}, 7, 'AB', True, False, False, # will fail due to partial sequence + marks=pytest.mark.xfail( + raises=PartialBlockSequence, + reason='Partial sequence repeat needed to get odd number block out of AB, but partial blocks are disabled', + strict=True, + ) + ), + pytest.param( + 'monogrp_mpd_tmc', {'head':'MPD-1', 'tail' : 'TMC-1'}, 12, 'ABC', True, True, False, # will fail due to 3-functional TMC middle monomer as C + marks=pytest.mark.xfail( + raises=MorphologyError, + reason='One of the monomers requested is non-linear (3-functional)', + strict=True, + ) + ), + # PEG-PLGA + ('monogrp_peg_plga', {}, 15, 'ABC', True, True, False), # test autogen + ('monogrp_peg_plga', {}, 17, 'ABC', True, False, False), # test autogen with whole sequence + ('monogrp_peg_plga', {'head':'PGA-1A', 'tail' : 'PGA-1B'}, 15, 'ABC', True, True, False), # test more complex sequence with non-default explicit end groups + pytest.param( + 'monogrp_peg_plga', 
{'head':'PGA-1A', 'tail' : 'PGA-1B'}, 15, 'ABC', True, False, False, # will fail due to partial sequence + marks=pytest.mark.xfail( + raises=PartialBlockSequence, + reason='Partial sequence repeat needed to get a non-whole number of blocks out of ABC, but partial blocks are disabled', + strict=True, + ) + ), + ('monogrp_peg_plga', {}, 40, 'ABCB', True, True, True), # test longer energy min + ] +) +def test_build_linear_polymer( + monomers : MonomerGroup, + term_orient : dict[str, str], + n_monomers : int, + sequence : str, + minimize_sequence : bool, + allow_partial_sequences : bool, + energy_minimize : bool, + request : pytest.FixtureRequest, # allows for fixture expansion in parameterized arguments + ) -> None: + '''Test linear polymer builder behavior under varying sets of parameters''' + monomers = request.getfixturevalue(monomers) # unpack fixtures into their respective values + monomers.term_orient = term_orient # this edit makes it VITAL that fixtures be function-level + + polymer = build_linear_polymer( + monomers=monomers, + n_monomers=n_monomers, + sequence=sequence, + minimize_sequence=minimize_sequence, + allow_partial_sequences=allow_partial_sequences, + add_Hs=False, + energy_minimize=energy_minimize, + ) + + # characterize middle monomers + n_rep_units = len(polymer.children) + residue_sizes : dict[str, int] = {} + residue_counts = Counter() # TODO: make use of this for checks!! + for middle_monomers in polymer.children: + residue_sizes[middle_monomers.name] = middle_monomers.n_particles + residue_counts[middle_monomers.name] += 1 + + # characterize end groups + end_groups_requested = set(resname for head_or_tail, (resname, mol) in monomers.linear_end_groups().items()) + end_groups_used = set() + for end_group in polymer.end_groups: + if end_group is not None: + end_groups_used.add(end_group.name) + residue_sizes[end_group.name] = end_group.n_particles + residue_counts[end_group.name] += 1 + + total_reps_match = (n_rep_units == n_monomers) + contribs_match = all(n_atoms == monomers.contributions()[resname][0] + for resname, n_atoms in residue_sizes.items() + ) + end_groups_correct = (end_groups_used == end_groups_requested) + # counts_match = ... 
+ + assert all([total_reps_match, contribs_match, end_groups_correct]) #, and counts_match ) \ No newline at end of file diff --git a/polymerist/tests/polymers/building/test_sequencing.py b/polymerist/tests/polymers/building/test_sequencing.py new file mode 100644 index 0000000..272fd4d --- /dev/null +++ b/polymerist/tests/polymers/building/test_sequencing.py @@ -0,0 +1,123 @@ +'''Testing that copolymer sequencing scales (and fails) as expected''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +from typing import Any +from dataclasses import asdict + +import pytest +from pathlib import Path + +from polymerist.polymers.building.sequencing import LinearCopolymerSequencer as LCS +from polymerist.polymers.exceptions import EmptyBlockSequence, PartialBlockSequence, InsufficientChainLength, EndGroupDominatedChain + + +@pytest.fixture +def sequencer() -> LCS: + '''A sample sequencer with known, valid inputs''' + return LCS(sequence_kernel='ABAB', n_repeat_units=14, n_repeat_units_terminal=2) + +@pytest.mark.parametrize( + 'inputs', + [ + { + 'sequence_kernel' : 'AB', + 'n_repeat_units' : 10, + 'n_repeat_units_terminal' : 1 + }, + pytest.param( + { + 'sequence_kernel' : 'BAC', + 'n_repeat_units' : 1, + 'n_repeat_units_terminal' : 2 + }, + marks=pytest.mark.xfail( + raises=EndGroupDominatedChain, + reason='Results in (unsatisfiable) negative number of middle monomers', + strict=True, + ) + ), + pytest.param( + { + 'sequence_kernel' : '', + 'n_repeat_units' : 7, + 'n_repeat_units_terminal' : 1 + }, + marks=pytest.mark.xfail( + raises=EmptyBlockSequence, + reason='No sequence kernel provided', + strict=True, + ) + ), + ] +) +def test_LCS_input_validation(inputs : dict[str, Any]) -> None: + '''Test that invalid sequencer inputs are correctly rejected''' + _ = LCS(**inputs) # no assert needed, just checking that initialization completes + +def test_LCS_copying(sequencer : LCS) -> None: + '''Test that sequencers are properly copied in a read-only manner''' + sequencer_clone = sequencer.copy() + + # tamper with the parameters of the copy in a way that guarantees distinctness + sequencer_clone.sequence_kernel = 2*sequencer.sequence_kernel + sequencer_clone.n_repeat_units += 2 + sequencer_clone.n_repeat_units_terminal += 1 + + # check that the original WASN'T tampered with + assert asdict(sequencer) != asdict(sequencer_clone) + + +@pytest.mark.parametrize( + 'sequencer, expected_kernel', + [ + (LCS('ABC', n_repeat_units=12), 'ABC'), # test irreducible case + (LCS('ABAB', n_repeat_units=12), 'AB'), # test unreduced case + ] +) +def test_LCS_reduction(sequencer : LCS, expected_kernel : str) -> None: + '''Test that shortest repeating subsequences of sequencer kernels are correctly identified''' + sequencer.reduce() + assert sequencer.sequence_kernel == expected_kernel + +@pytest.mark.parametrize( + 'sequencer, allow_partials, expected_sequence, expected_length', + [ + # tests for homopolymers + (LCS('A', 5, 1), True , 'A', 4), + (LCS('A', 5, 1), False, 'A', 4), # a partial block of a single-monomer sequence can never exist, so the "allow_partial_sequences" setting shouldn't matter + pytest.param( + LCS('A', 1, 1), True, 'A', 1, # test that all-end group (i.e. 
no middle monomer) case is correctly rejected + marks=pytest.mark.xfail( + raises=InsufficientChainLength, + reason='No middle monomers can be accommodated', + strict=True, + ), + ), + # tests for "true" copolymers + (LCS('ABC', 10, 2), True, 'ABCABCAB', 1), + pytest.param( + LCS('ABC', 10, 2), False, 'ABCABCAB', 1, # test that partial-sequence ban correctly blocks partial sequences... + marks=pytest.mark.xfail( + raises=PartialBlockSequence, + reason='Partial sequence repeats have not been allowed', + strict=True, + ), + ), + (LCS('ABC', 11, 2), False, 'ABC', 3), # ...unless the resulting sequence happens to be a whole multiple + pytest.param( + LCS('ABC', 2, 2), True, '', 1, # test that all-end group (i.e. no middle monomer) case is correctly rejected... + marks=pytest.mark.xfail( + raises=InsufficientChainLength, + reason='No middle monomers can be accommodated', + strict=True, + ), + ), + (LCS('ABC', 4, 2), True, 'AB', 1), # ... and finally, check that nonempty sequences SMALLER than the kernel are also recognized if partials are permitted + ] +) +def test_LCS_procrustean_alignment(sequencer : LCS, allow_partials : bool, expected_sequence : str, expected_length : int) -> None: + '''Test capability (and prechecks) for fitting sequence to target chain length''' + seq, n_reps = sequencer.procrustean_alignment(allow_partial_sequences=allow_partials) + assert (seq == expected_sequence) and (n_reps == expected_length) \ No newline at end of file diff --git a/polymerist/tests/polymers/monomers/test_repr.py b/polymerist/tests/polymers/monomers/test_repr.py new file mode 100644 index 0000000..bf93cdf --- /dev/null +++ b/polymerist/tests/polymers/monomers/test_repr.py @@ -0,0 +1,192 @@ +'''Tests that collections of monomer fragments are treated as expected''' + +__author__ = 'Timotej Bernat' +__email__ = 'timotej.bernat@colorado.edu' + +from typing import Any + +import pytest + +from polymerist.polymers.monomers.repr import MonomerGroup +from polymerist.polymers.monomers.fragments import PE_FRAGMENTS, MPD_TMC_FRAGMENTS, PEG_PLGA_FRAGMENTS + + +# Example fragment groups +@pytest.fixture(scope='function') # want to re-initialize for each test function to avoid cross-contamination +def monogrp_degenerate() -> MonomerGroup: + return MonomerGroup(monomers={}) + +@pytest.fixture(scope='function') +def monogrp_polyethylene() -> MonomerGroup: + return MonomerGroup(monomers=PE_FRAGMENTS) + +@pytest.fixture(scope='function') +def monogrp_mpd_tmc() -> MonomerGroup: + return MonomerGroup(monomers=MPD_TMC_FRAGMENTS) + +@pytest.fixture(scope='function') +def monogrp_peg_plga() -> MonomerGroup: + return MonomerGroup(monomers=PEG_PLGA_FRAGMENTS) + + +# Testing all routes to initialization +@pytest.mark.parametrize( + 'monomers', + [ + { # nominal test case + 'PGA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]'], + 'PGA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]'], + 'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'], + }, + { # test that list closure autofill works + 'PGA-1A': '[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]', + 'PGA-1B': '[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]', + 'PGA-2': '[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]', + }, + # XFAILS: test that the initializer rejects... 
+ pytest.param( + { # ...1) non-string like objects + 'foo' : 42.0, + 'bar' : True, + }, + marks=pytest.mark.xfail( + raises=TypeError, + reason='Monomer fragment inputs are not stringlike', + strict=True, + ), + ), + pytest.param( + { # 1a) more subtly, lists OF CONTAINERS of valid SMARTS are still invalid + 'PGA-1A': [( + '[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]', + '[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]' + )], + 'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'], + }, + marks=pytest.mark.xfail( + raises=TypeError, + reason='Monomer fragment inputs are not stringlike', + strict=True, + ), + ), + pytest.param( + { # ...2) non-empty strings which are nevertheless invalid SMARTS + #- NOTE: empty strings, perhaps surprisingly, actually ARE valid as SMARTS and therefore aren't xfail tested here + 'fake-1': ['this is a bogus SMARTS'], + 'invalid-2': ['so_is_this'], + }, + marks=pytest.mark.xfail( + raises=ValueError, + reason='At least one monomer fragment input is not a valid SMARTS string', + strict=True, + ), + ), + pytest.param( + { # ...2a) this one is very subtle, but SMARTS with slight errors which invalidate them as SMARTS should also fail + 'PGA-1A': ['[OH]CD(=O)*'], # fat-finger mistake, "D" should be "C" + 'PGA-2': ['*OCC(+O)*'], # forgot to hit shift when typing double bond + }, + marks=pytest.mark.xfail( + raises=ValueError, + reason='At least one monomer fragment input is not a valid SMARTS string', + strict=True, + ), + ), + ] +) +def test_monogrp_init(monomers : dict[str, Any]) -> None: + '''Check that the MonomerGroup initializer handles valid inputs as expected and rejects invalid inputs''' + _ = MonomerGroup(monomers=monomers) # no assert needed, just checking that initialization completes + +# Testing properties of contained monomers +@pytest.mark.parametrize( + 'monogrp, expected_is_linear', + [ + ('monogrp_peg_plga', True), + ('monogrp_mpd_tmc', False), + ], +) +def test_monogrp_linearity(monogrp : MonomerGroup, expected_is_linear : bool, request : pytest.FixtureRequest) -> None: + '''Test whether branched and unbranched chain fragment detection behaves as expected''' + monogrp = request.getfixturevalue(monogrp) # unpack fixtures into their respective values + assert monogrp.is_linear == expected_is_linear + +@pytest.mark.parametrize( + 'monogrp, expected_counts', + [ + ('monogrp_peg_plga', (3, 6)), + ('monogrp_mpd_tmc', (3, 2)), + ], +) +def test_monogrp_mid_and_term_counts(monogrp : MonomerGroup, expected_counts : tuple[int, int], request : pytest.FixtureRequest) -> None: + '''Test whether middle and terminal monomers are counted correctly''' + monogrp = request.getfixturevalue(monogrp) # unpack fixtures into their respective values + assert monogrp.num_mid_and_term == expected_counts + +# Testing end group determination +@pytest.mark.parametrize( + 'monogrp, term_orient, expected_end_groups', + [ + # 1) test autogeneration of orientations when... + ( # ...term orientation is unspecified but can be completed for both ends (i.e. at least 2 terminal monomers are available) + 'monogrp_peg_plga', + {}, + {'head' : 'PEG-1A', 'tail' : 'PEG-1B'}, + ), + ( # ...term orientation is unspecified and can only be partially completed (i.e. 
fewer than 2 terminal monomers are available) + 'monogrp_polyethylene', + {}, + {'head' : 'PE1'}, + ), + ( # ...term orientation is unspecified and no end monomers are available for auto-assignment + 'monogrp_degenerate', + {}, + {}, + ), + ( # ...term orientation is unspecified but can be completed for both ends (i.e. at least 2 terminal monomers are available) + 'monogrp_peg_plga', + {}, + {'head' : 'PEG-1A', 'tail' : 'PEG-1B'}, + ), + # 2) test end group identification for correctly-specified term orientation + ( # test nominal case + 'monogrp_peg_plga', + {'head' : 'PGA-1A', 'tail' : 'PEG-1B'}, + {'head' : 'PGA-1A', 'tail' : 'PEG-1B'}, + ), + ( # test that duplication works as expected + 'monogrp_polyethylene', + {'head' : 'PE1', 'tail' : 'PE1'}, + {'head' : 'PE1', 'tail' : 'PE1'}, + ), + # 3) test incorrect specifications + ( # specification without "head"/"tail" keys will not fail, but WILL default to auto-gen + 'monogrp_peg_plga', + {'first' : 'PGA-1A', 'second' : 'PEG-1B'}, + {'head' : 'PEG-1A', 'tail' : 'PEG-1B'}, + ), + pytest.param( # specification with invalid monomer names (i.e. keys not in the "monomers" dict) should raise an outright error + 'monogrp_peg_plga', + {'head' : 'PGG-2C', 'tail' : 'BOGUS'}, + None, + marks=pytest.mark.xfail( + raises=KeyError, + reason='Term group names specified don\'t exist within the monomer fragments defined', + strict=True, + ) + ), + ], +) +def test_monogrp_end_groups(monogrp : MonomerGroup, term_orient : dict[str, str], expected_end_groups : dict[str, str], request : pytest.FixtureRequest) -> None: + '''Test that procedural end group determination behaves as expected''' + monogrp = request.getfixturevalue(monogrp) # unpack fixtures into their respective values + monogrp.term_orient = term_orient + + end_group_catalogue = monogrp.linear_end_groups() + end_group_names = { + head_or_tail : resname # drop RDKit Mol for check (Mol object is harder to validate, use bound name as proxy) + for head_or_tail, (resname, _) in end_group_catalogue.items() + } + + assert end_group_names == expected_end_groups + \ No newline at end of file
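
Usage sketch for the reworked MonomerGroup API above (illustrative only; it mirrors the fixtures and assertions in test_repr.py, assumes polymerist and its RDKit dependency are installed, and the variable name grp is a placeholder):

from polymerist.polymers.monomers.repr import MonomerGroup
from polymerist.polymers.monomers.fragments import PEG_PLGA_FRAGMENTS

# __post_init__ wraps bare SMARTS into lists and validates each entry via is_valid_SMARTS
grp = MonomerGroup(monomers=PEG_PLGA_FRAGMENTS)
grp.term_orient = {'head' : 'PGA-1A', 'tail' : 'PEG-1B'}  # residue names must exist in grp.monomers

# end groups come back as {'head'/'tail' : (residue name, RDKit Mol)} pairs
for head_or_tail, (resname, rdmol) in grp.linear_end_groups().items():
    print(head_or_tail, resname, rdmol.GetNumAtoms())

# per-residue counts of real (i.e. non-linker) atoms, as used by the builder tests
print(grp.contributions())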
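A minimal sketch of the SerialAtomLabeller behavior pinned down in test_pdb.py; the expected labels are copied from the first parametrized case (default PDB-compatible settings):

from polymerist.molfiles.pdb import SerialAtomLabeller

# elements of methylcarbamoyl chloride, labelled with width-4, per-element serial indices
labeller = SerialAtomLabeller(atom_label_width=4, include_elem_idx=True, default_elem_idx=0)
labels = [labeller.get_atom_label(elem) for elem in ('C', 'H', 'H', 'H', 'N', 'H', 'C', 'O', 'Cl')]
assert labels == ['C000', 'H000', 'H001', 'H002', 'N000', 'H003', 'C001', 'O000', 'Cl00']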
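Usage sketch for build_linear_polymer, mirroring the passing MPD-TMC case in test_linear.py (illustrative; it assumes the optional structure-building dependencies referenced by the tests, e.g. mbuild, are installed):

from polymerist.polymers.building import build_linear_polymer
from polymerist.polymers.monomers.repr import MonomerGroup
from polymerist.polymers.monomers.fragments import MPD_TMC_FRAGMENTS

monomers = MonomerGroup(monomers=MPD_TMC_FRAGMENTS)
monomers.term_orient = {'head' : 'MPD-1', 'tail' : 'TMC-1'}  # explicit, linear-compatible end groups

# 8 total repeat units from sequence 'A' (a whole number of repeats), as in the correctly-specified test case
polymer = build_linear_polymer(
    monomers=monomers,
    n_monomers=8,
    sequence='A',
    minimize_sequence=True,
    allow_partial_sequences=True,
    add_Hs=False,
    energy_minimize=False,
)
print(len(polymer.children), [end.name for end in polymer.end_groups if end is not None])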
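Worked example of the chain-length arithmetic that test_sequencing.py exercises for LinearCopolymerSequencer.procrustean_alignment (expected values are taken directly from the parametrized cases above):

from polymerist.polymers.building.sequencing import LinearCopolymerSequencer

# 10 total repeat units, 2 reserved for end groups -> 8 middle units to cover with 'ABC' blocks
seq = LinearCopolymerSequencer(sequence_kernel='ABC', n_repeat_units=10, n_repeat_units_terminal=2)
sequence, n_reps = seq.procrustean_alignment(allow_partial_sequences=True)
assert (sequence, n_reps) == ('ABCABCAB', 1)  # padded partial block, repeated once

# with partials disallowed, the middle units must divide into whole kernel repeats: 11 - 2 = 9 = 3 x 'ABC'
seq = LinearCopolymerSequencer(sequence_kernel='ABC', n_repeat_units=11, n_repeat_units_terminal=2)
sequence, n_reps = seq.procrustean_alignment(allow_partial_sequences=False)
assert (sequence, n_reps) == ('ABC', 3)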