timbernat · timbernat · Dec 12, 2024 · Dec 4, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/polymerist/genutils/importutils/dependencies.py b/polymerist/genutils/importutils/dependencies.py
@@ -3,7 +3,7 @@
 __author__ = 'Timotej Bernat'
 __email__ = '[email protected]'
 
-from typing import Callable, ParamSpec, TypeVar
+from typing import Callable, Optional, ParamSpec, TypeVar
 
 Params = ParamSpec('Params')
 ReturnType = TypeVar('ReturnType')
@@ -14,6 +14,25 @@
 from functools import wraps
 
 
+class MissingPrerequisitePackage(Exception):
+    '''
+    Raised when a package dependency cannot be found and the user should be alerted with install instructions
+
+    Parameters
+    ----------
+    importing_package_name : str
+        Name of the package whose import triggered the check
+        The message tells the user to retry importing from this package
+    use_case : str
+        Short description of the functionality which needs the dependency; opens the message
+        Only its first character is uppercased, any other capitalization (e.g. "OpenFF") is kept as given
+    install_link : str
+        Location of the installation instructions for the dependency
+    dependency_name : str
+        Name of the dependency, as quoted in the "Please install" line of the message
+    dependency_name_formal : Optional[str], default None
+        Descriptive name of the dependency used in the "require(s)" clause of the message
+        Falls back to "dependency_name" if not provided
+    '''
+    def __init__(self,
+            importing_package_name : str,
+            use_case : str,
+            install_link : str,
+            dependency_name : str,
+            dependency_name_formal : Optional[str]=None
+        ):
+        if dependency_name_formal is None:
+            dependency_name_formal = dependency_name
+
+        # str.capitalize() would lowercase everything after the first character ("OpenFF addons" -> "Openff addons"),
+        # so only the leading character is uppercased; slicing (rather than indexing) keeps this safe for an empty string
+        use_case_label = use_case[:1].upper() + use_case[1:]
+
+        message = f'''
+        {use_case_label} require(s) {dependency_name_formal}, which was not found in the current environment
+        Please install `{dependency_name}` by following the installation instructions at {install_link}
+        Then try importing from "{importing_package_name}" again'''
+
+        super().__init__(message)
+
 def module_installed(module_name : str) -> bool:
     '''
     Check whether a module of the given name is present on the system

diff --git a/polymerist/genutils/textual/strsearch.py b/polymerist/genutils/textual/strsearch.py
diff --git a/polymerist/genutils/textual/substrings.py b/polymerist/genutils/textual/substrings.py
@@ -0,0 +1,81 @@
+'''For identifying and concatenating substrings of other strings with unique properties'''
+
+__author__ = 'Timotej Bernat'
+__email__ = '[email protected]'
+
+
+def unique_string(string : str, preserve_order : bool=True) -> str:
+    '''
+    Accepts a string and returns another string containing
+    only the UNIQUE characters in the origin string
+
+    Can specify whether order is important with the "preserve_order" keyword
+
+    Parameters
+    ----------
+    string : str
+        An arbitrary string one wants the unique characters from
+    preserve_order : bool, default True
+        Whether or not to keep the unique characters in the order they are first found
+        For example:
+            unique_string("balaclava", preserve_order=False) -> "bcavl" (or any other ordering of the same characters)
+            unique_string("balaclava", preserve_order=True) -> "balcv"
+
+    Returns
+    -------
+    uniquified_str : str
+        Another string containing only the unique characters in "string"
+        Order depends on the value of the "preserve_order" parameter
+    '''
+    if preserve_order:
+        # dict keys are unique and retain insertion order, so this yields first-occurrence order
+        # in a single pass, rather than re-scanning a growing list for every character
+        unique_chars = dict.fromkeys(string)
+    else:
+        unique_chars = set(string) # iteration order of a set is arbitrary
+
+    return ''.join(unique_chars)
+
+def shortest_repeating_substring(string : str) -> str:
+    '''
+    Find the shortest substring which, repeated a whole number of times (possibly just once), reproduces the passed string
+    The original string itself is returned when no shorter repeating unit exists
+    '''
+    doubled = string + string
+    # search for the string inside two back-to-back copies of itself, excluding the trivial matches
+    # at offset 0 and at offset len(string); any hit is an offset at which the string lines up with itself
+    period = doubled.find(string, 1, -1)
+    if period == -1:
+        return string
+    return string[:period]
+
+def repeat_string_to_length(string : str, target_length : int, joiner : str='') -> str:
+    '''
+    Cyclically repeats a string until a given number of its characters have been emitted
+    The final repeat may be partial, i.e. the number of repeats need not be a whole number
+    for example:
+    >> repeat_string_to_length("CAT", 6) -> "CATCAT"
+    >> repeat_string_to_length("BACA", 10) -> "BACABACABA"
+
+    Parameters
+    ----------
+    string : str
+        The (nonempty) string to repeat
+    target_length : int
+        Total number of characters to draw from repeats of "string"
+        Need NOT be a multiple of the length of "string"
+            E.g. repeat_string_to_length("BACA", 10) -> "BACABACABA"
+        and may also be shorter than "string"
+            E.g. repeat_string_to_length("BACA", 3) -> "BAC"
+    joiner : str, default ''
+        Text inserted between consecutive repeats (including before a partial final repeat)
+        Characters of the joiner are NOT counted towards "target_length"
+            E.g. repeat_string_to_length("AB", 5, joiner="-") -> "AB-AB-A"
+
+    Returns
+    -------
+    rep_string : str
+        The repeats of "string" (whole ones first, then any partial one) joined by "joiner"
+
+    Raises
+    ------
+    ValueError
+        If "string" is empty
+    TypeError
+        If "target_length" is not an integer
+    IndexError
+        If "target_length" is negative
+    '''
+    if not string:
+        raise ValueError('Cannot generate nonempty string from any amount of repeats of the empty string')
+    if not isinstance(target_length, int):
+        raise TypeError(f'Only integer target string lengths are allowed, not non-integer type "{type(target_length).__name__}"')
+    if target_length < 0:
+        raise IndexError(f'Cannot generate a string of negative length (requested length of {target_length} character(s))')
+
+    whole_cycles, leftover = divmod(target_length, len(string))
+    pieces = (string,) * whole_cycles # kept as a tuple, as in the original implementation
+    if leftover: # only add a partial repeat when one is needed, so no dangling joiner is ever emitted
+        pieces += (string[:leftover],)
+
+    return joiner.join(pieces)
+
diff --git a/polymerist/mdtools/openfftools/__init__.py b/polymerist/mdtools/openfftools/__init__.py
@@ -4,13 +4,14 @@
 __email__ = '[email protected]'
 
 # Subpackage-wide precheck to see if OpenFF is even usable in the first place
-from ...genutils.importutils.dependencies import modules_installed
+from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage
 if not modules_installed('openff', 'openff.toolkit'):
-    raise ModuleNotFoundError(
-        f'''
-        OpenFF packages which are required to utilitize {__name__} not found in current environment
-        Please follow installation instructions at https://docs.openforcefield.org/projects/toolkit/en/stable/installation.html, then retry import
-        '''
+    raise MissingPrerequisitePackage(
+        importing_package_name=__spec__.name,
+        use_case='OpenFF addons',
+        install_link='https://docs.openforcefield.org/projects/toolkit/en/stable/installation.html',
+        dependency_name='openff-toolkit',
+        dependency_name_formal='the OpenFF software stack',
     )
 
 # Import of toplevel OpenFF object registries

diff --git a/polymerist/mdtools/openfftools/boxvectors.py b/polymerist/mdtools/openfftools/boxvectors.py
@@ -14,7 +14,7 @@
 from openff.toolkit import Topology
 from openff.interchange.components._packmol import _box_vectors_are_in_reduced_form
 
-from .omminter.unitsys import allow_openmm_units, openff_to_openmm
+from .unitsys import allow_openmm_units, openff_to_openmm
 
 
 # CUSTOM TYPES FOR CLARITY, ESPECIALLY WITH UNITS

diff --git a/polymerist/mdtools/openfftools/omminter/__init__.py b/polymerist/mdtools/openfftools/omminter/__init__.py
@@ -4,9 +4,3 @@
 __email__ = '[email protected]'
 
 from .mdobjects import forcefield_flexible, openff_topology_to_openmm
-from .unitsys import (
-    openmm_to_openff,
-    openff_to_openmm, 
-    allow_openmm_units,
-    allow_openff_units,
-)
diff --git a/polymerist/mdtools/openfftools/omminter/mdobjects.py b/polymerist/mdtools/openfftools/omminter/mdobjects.py
@@ -15,7 +15,7 @@
 from openmm.app import Topology as OMMTopology
 from openmm.unit import Quantity
 
-from .unitsys import openff_to_openmm
+from ..unitsys import openff_to_openmm
 from .. import FFDIR
 from ..boxvectors import box_vectors_flexible, VectorQuantity, BoxVectorsQuantity
 
@@ -39,8 +39,13 @@ def forcefield_flexible(forcefield : Union[ForceField, str, Path]) -> ForceField
 
         return ForceField(ff_path)
 
-def openff_topology_to_openmm(offtop : OFFTopology, forcefield : Union[ForceField, str, Path], box_vecs : Optional[Union[VectorQuantity, BoxVectorsQuantity]]=None,
-                              combine_nonbonded_forces : bool=False, add_constrained_forces : bool=False) -> tuple[OMMTopology, System, Quantity]:
+def openff_topology_to_openmm(
+            offtop : OFFTopology,
+            forcefield : Union[ForceField, str, Path],
+            box_vecs : Optional[Union[VectorQuantity, BoxVectorsQuantity]]=None,
+            combine_nonbonded_forces : bool=False,
+            add_constrained_forces : bool=False
+        ) -> tuple[OMMTopology, System, Quantity]:
     '''Converts an OpenFF Topology to an OpenMM Topology, System, and Positions'''
     if box_vecs is not None:
         offtop.box_vectors = box_vectors_flexible(box_vecs)

diff --git a/polymerist/mdtools/openfftools/solvation/physprops.py b/polymerist/mdtools/openfftools/solvation/physprops.py
@@ -15,7 +15,7 @@
 from openff.units import Quantity as OFFQuantity
 
 from ....unitutils.dimensions import is_volume
-from ..omminter.unitsys import allow_openff_units, openff_to_openmm
+from ..unitsys import allow_openff_units, openff_to_openmm
 
 
 # MASS

diff --git a/polymerist/mdtools/openfftools/solvation/solvents/__init__.py b/polymerist/mdtools/openfftools/solvation/solvents/__init__.py
@@ -10,7 +10,6 @@
 from openff.units import unit as offunit
 
 from ... import topology
-from ... import TKREGS
 
 
 def generate_water_TIP3P() -> Molecule:

diff --git a/...t/mdtools/openfftools/omminter/unitsys.py → polymerist/mdtools/openfftools/unitsys.py b/...t/mdtools/openfftools/omminter/unitsys.py → polymerist/mdtools/openfftools/unitsys.py
diff --git a/polymerist/mdtools/openmmtools/serialization.py b/polymerist/mdtools/openmmtools/serialization.py
@@ -23,6 +23,7 @@
 from ...genutils.fileutils.pathutils import assemble_path
 from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable
 from ...genutils.fileutils.jsonio.serialize import PathSerializer
+from ...molfiles.pdb import SerialAtomLabeller
 
 
 # DEFINING AND STORING SIMULATION PATHS
@@ -119,12 +120,18 @@ def serialize_system(sys_path : Path, system : System) -> None:
         file.write(XmlSerializer.serialize(system))
 
 @allow_string_paths
-def serialize_openmm_pdb(pdb_path : Path, topology : OpenMMTopology, positions : Union[NDArray, list[Vec3]], keep_chain_and_res_ids : bool=True,
-                         uniquify_atom_ids : bool=True, num_atom_id_digits : int=2, resname_repl : Optional[dict[str, str]]=None) -> None:
+def serialize_openmm_pdb(
+        pdb_path : Path,
+        topology : OpenMMTopology,
+        positions : Union[NDArray, list[Vec3]],
+        keep_chain_and_res_ids : bool=True,
+        atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(),
+        resname_map : Optional[dict[str, str]]=None,
+    ) -> None:
     '''Configure and write an Protein DataBank File from an OpenMM Topology and array of positions
     Provides options to configure atom ID numbering, residue numbering, and residue naming'''
-    if resname_repl is None:
-        resname_repl = {} # avoids mutable default
+    if resname_map is None:
+        resname_map = {} # avoids mutable default
 
     # chain config
     for chain in topology.chains():
@@ -133,18 +140,14 @@ def serialize_openmm_pdb(pdb_path : Path, topology : OpenMMTopology, positions :
     # residue config
     for residue in topology.residues():
         residue.id = str(residue.id) # avoids TypeError when specifying keepIds during PDB write
-        repl_res_name = resname_repl.get(residue.name, None) # lookup current residue name to see if a replacement is called for
+        repl_res_name = resname_map.get(residue.name, None) # lookup current residue name to see if a replacement is called for
         if repl_res_name is not None:
             residue.name = repl_res_name
 
     # individual atom config
-    element_counter = Counter() # for keeping track of the running index of each distinct element - could be used to produce a Hill formula
-    for atom in topology.atoms():
-        symbol = atom.element.symbol
-        atom_id = element_counter[symbol]
-        if uniquify_atom_ids:
-            atom.name = f'{symbol}{atom_id:0{num_atom_id_digits}d}' # extend atom name with ordered integer with specified number of digits (including leading zeros)
-        element_counter[symbol] += 1
+    if atom_labeller: # implicitly, preserves extant atom names if a labeller is not given
+        for atom in topology.atoms():
+            atom.name = atom_labeller.get_atom_label(atom.element.symbol)
 
     # file write
     with pdb_path.open('w') as file:

diff --git a/polymerist/molfiles/__init__.py b/polymerist/molfiles/__init__.py
@@ -0,0 +1,4 @@
+'''Utilities for reading from and writing to various molecular file formats'''
+
+__author__ = 'Timotej Bernat'
+__email__ = '[email protected]'
diff --git a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py
@@ -0,0 +1,75 @@
+'''PDB file formatting tools'''
+
+__author__ = 'Timotej Bernat'
+__email__ = '[email protected]'
+
+from dataclasses import dataclass, field
+from collections import Counter
+
+
+@dataclass(frozen=True)
+class SerialAtomLabeller:
+    '''
+    For assigning unique numbered atom names based on their
+    order of appearance within a molecule and elemental class
+
+    Useful, for example, in generating unique atom names for a PDB file
+
+    Parameters
+    ----------
+    atom_label_width : int, default 4
+        Exact length allotted for any generated atom label; must be non-negative
+        Labels shorter than this are right-padded with spaces,
+        while labels longer than this are truncated
+
+        NOTE: because of this truncation, labels stop being unique once an element's
+        index needs more digits than the width leaves available after the element symbol
+
+        Default of 4 is chosen to be compatible with the PDB specification ("Atom name: columns 13-16, left-justified")
+        https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
+    include_elem_idx : bool, default True
+        Whether to attach a numerical element-index postfix to atom labels
+
+        E.g. with atom_label_width=4, the fifth carbon in a topology
+        will be labelled as "C004" with include_elem_idx=True,
+        while labelled as "C   " with include_elem_idx=False
+    default_elem_idx : int, default 0
+        Starting index for each element category
+        By default, is 0-indexed; MUST BE NON-NEGATIVE
+    '''
+    atom_label_width : int = 4
+    include_elem_idx : bool = True
+    default_elem_idx : int = 0
+
+    # running per-element tally of the next index to hand out; not settable through the constructor
+    # the Counter is only ever mutated in place (never rebound), which frozen=True does not prevent
+    element_counter : Counter = field(init=False, default_factory=Counter)
+
+    def __post_init__(self) -> None:
+        '''Check ranges on input values, raising ValueError if either numeric field is negative'''
+        if self.atom_label_width < 0:
+            raise ValueError(f'Must provide a non-negative atom label width (provided {self.atom_label_width})')
+
+        if self.default_elem_idx < 0:
+            raise ValueError(f'Must provide a non-negative starting index for element indices (provided {self.default_elem_idx})')
+
+    def get_atom_label(self, elem_symbol : str) -> str:
+        '''
+        Obtain a numbered atom label for an atom based on its element,
+        updating the underlying element context in the process
+
+        Parameters
+        ----------
+        elem_symbol : str
+            Symbol of the atom's element, used as the prefix of the label
+
+        Returns
+        -------
+        atom_name : str
+            Label of exactly "atom_label_width" characters (space-padded or truncated as necessary)
+
+        Raises
+        ------
+        TypeError
+            If "elem_symbol" is not a string
+        '''
+        if not isinstance(elem_symbol, str):
+            raise TypeError(f'Must pass symbol of atom\'s element as str (not type {type(elem_symbol).__name__})')
+
+        # first occurrence of an element starts counting from the configured starting index
+        self.element_counter.setdefault(elem_symbol, self.default_elem_idx)
+
+        atom_idx_label : str = ''
+        if self.include_elem_idx:
+            atom_idx = self.element_counter[elem_symbol]
+            num_idx_digits = max(self.atom_label_width - len(elem_symbol), 0) # number of symbols left over for an atom index
+            atom_idx_label = f'{atom_idx:0{num_idx_digits}d}' # zero-padded to fill the remaining width
+
+        atom_name = f'{elem_symbol}{atom_idx_label}'
+        atom_name = atom_name.ljust(self.atom_label_width, ' ')[:self.atom_label_width] # pad with spaces if too short, or truncate if too long
+        assert(len(atom_name) <= self.atom_label_width) # perfunctory check to make sure things are working as expected
+
+        self.element_counter[elem_symbol] += 1 # update tally with addition of new occurrence of a particular element
+
+        return atom_name
+