From 1adc0b4e242d6d3a2bec251327f7d82a7d3c3f61 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 3 Dec 2024 18:50:37 -0700
Subject: [PATCH 01/78] Added typehints for SMILES and SMARTS strings

---
 polymerist/smileslib/primitives.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/polymerist/smileslib/primitives.py b/polymerist/smileslib/primitives.py
index 0134f3c..844cad9 100644
--- a/polymerist/smileslib/primitives.py
+++ b/polymerist/smileslib/primitives.py
@@ -3,10 +3,24 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
+from typing import TypeAlias
+
 from rdkit import Chem
 from rdkit.Chem.rdchem import BondType
 
 
+# VALIDATION
+Smiles : TypeAlias = str # purely for improving self-documentation of functions, no benefit to static type-checkers
+Smarts : TypeAlias = str # purely for improving self-documentation of functions, no benefit to static type-checkers
+
+def is_valid_SMARTS(smarts : str) -> bool:
+    '''Check if SMARTS string is valid (according to RDKit)'''
+    return (Chem.MolFromSmarts(smarts) is not None)
+
+def is_valid_SMILES(smiles : str) -> bool:
+    '''Check if SMILES string is valid (according to RDKit)'''
+    return (Chem.MolFromSmiles(smiles) is not None)
+
 # BOND PRIMITIVES AND RELATED OBJECTS
 BOND_PRIMITIVES = '~-=#$:'
 BOND_PRIMITIVES_FOR_REGEX = r'[~\-=#$:]' # any of the SMARTS bond primitive chars, with a space to differentiate single-bond hyphen for the regex range char
@@ -36,13 +50,3 @@
             bonds_by_order[order]     = prim_str
             rdbonds_by_type[bondtype] = rd_bond
             rdbonds_by_order[order]   = rd_bond
-
-
-# VALIDATION
-def is_valid_SMARTS(smarts : str) -> bool:
-    '''Check if SMARTS string is valid (according to RDKit)'''
-    return (Chem.MolFromSmarts(smarts) is not None)
-
-def is_valid_SMILES(smiles : str) -> bool:
-    '''Check if SMARTS string is valid (according to RDKit)'''
-    return (Chem.MolFromSmiles(smiles) is not None)
\ No newline at end of file

From 59199131d28e6c3f247ae3ea74e6f5769964b96e Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 3 Dec 2024 20:18:35 -0700
Subject: [PATCH 02/78] Exposed Smiles and Smarts typehints at subpackage level

---
 polymerist/smileslib/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polymerist/smileslib/__init__.py b/polymerist/smileslib/__init__.py
index 647f88b..649e161 100644
--- a/polymerist/smileslib/__init__.py
+++ b/polymerist/smileslib/__init__.py
@@ -3,4 +3,4 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
-from .primitives import is_valid_SMILES, is_valid_SMARTS
\ No newline at end of file
+from .primitives import is_valid_SMILES, is_valid_SMARTS, Smiles, Smarts
\ No newline at end of file

From d5cd8fc2c8acba5cde96ce1054c6014f98a10015 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 3 Dec 2024 20:18:56 -0700
Subject: [PATCH 03/78] Added function for uniquifying strings (which can
 preserve character order)

---
 polymerist/genutils/textual/strsearch.py | 35 ++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/polymerist/genutils/textual/strsearch.py b/polymerist/genutils/textual/strsearch.py
index 919e797..14e080a 100644
--- a/polymerist/genutils/textual/strsearch.py
+++ b/polymerist/genutils/textual/strsearch.py
@@ -6,8 +6,39 @@
 from typing import Callable, Optional
 from pathlib import Path
 
-from ..fileutils.extensions import FileTypeError
 
+def uniquify_str(string : str, preserve_order : bool=True) -> str:
+    '''
+    Accepts a string and returns another string containing
+    only the UNIQUE characters in the origin string
+    
+    Can specify whether order is important with the "preserve_order" keyword
+    
+    Parameters
+    ----------
+    string : str
+        An arbitrary string one wants the unique characters from
+    preserve_order : bool, default True
+        Whether or not to keep the unique characters in the order they are found
+        For example: 
+            uniquify_str("balaclava", preserve_order=False) -> "bcavl"
+            uniquify_str("balaclava", preserve_order=True) -> "balcv"
+        
+    Returns
+    -------
+    uniquified_str : str
+        Another string containing only the unique characters in "string"
+        Order depends on the value of the "preserve_order" parameter
+    '''
+    if not preserve_order:
+        unique_chars = set(string)
+    else:
+        unique_chars = []
+        for char in string:
+            if char not in unique_chars:
+                unique_chars.append(char)
+    
+    return ''.join(unique_chars)
 
 def shortest_repeating_substring(string : str) -> str:
     '''Return the shortest substring such that the passed string can be written as some number of repeats (including 1) of the substring
@@ -29,7 +60,7 @@ def filter_text_by_condition(in_text_path : Path, condition : Callable[[str], bo
         raise PermissionError(f'Attempting to overwrite {in_text_path} with regex filter') # prevent write clash
     
     if (out_text_path.suffix != in_text_path.suffix):  # prevent file type conversion during transfer
-        raise FileTypeError(f'Input and output file must have same extension (not {in_text_path.suffix} and {out_text_path.suffix})')
+        raise ValueError(f'Input and output file must have same extension (not {in_text_path.suffix} and {out_text_path.suffix})')
 
     with out_text_path.open('w') as outfile: 
         with in_text_path.open('r') as infile: # readfile is innermost in case error occurs during file read (caught by handler one level up)

From 9a6278e0ca76f3cf2b3b4124c2d335a7e19de2fe Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 3 Dec 2024 20:55:45 -0700
Subject: [PATCH 04/78] Updated DOP calculation to check for and yield correct
 number of monomers regardless of block sequence length

---
 polymerist/polymers/building.py | 46 +++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index e7898c3..8ccd864 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -16,15 +16,18 @@
 from pathlib import Path
 from rdkit import Chem
 
-from .exceptions import MorphologyError
+from .exceptions import InsufficientChainLengthError, MorphologyError
 from .estimation import estimate_chain_len_linear
-from ..polymers.monomers.repr import MonomerGroup
-from ..polymers.monomers.specification import SANITIZE_AS_KEKULE
 
 from ..genutils.decorators.functional import allow_string_paths
+from ..genutils.textual.strsearch import uniquify_str
+
 from ..rdutils.bonding.portlib import get_linker_ids
 from ..rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports
+
 from ..mdtools.openmmtools.serialization import serialize_openmm_pdb
+from ..polymers.monomers.repr import MonomerGroup
+from ..polymers.monomers.specification import SANITIZE_AS_KEKULE
 
 
 # CONVERSION
@@ -67,7 +70,7 @@ def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A',
     if not monomers.is_linear:
         raise MorphologyError('Linear polymer building does not support non-linear monomer input')
     
-    if monomers.has_valid_linear_term_orient:
+    if monomers.has_valid_linear_term_orient: # DEV: consider moving this logic into MonomerGroup
         term_orient = monomers.term_orient
         LOGGER.info(f'Using pre-defined terminal group orientation {term_orient}')
     else:
@@ -77,14 +80,31 @@ def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A',
         }
         LOGGER.warning(f'No valid terminal monomer orientations defined; autogenerated orientations "{term_orient}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
 
-    # 1) ADD MIDDLE MONOMERS TO CHAIN
+    # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE)
+    n_terminal = len(term_orient) # determine how many terminal monomers are actually present and well-defined
+    block_size = len(sequence)
+    
+    if ((DOP - n_terminal) % block_size) != 0:
+        raise ValueError(f'Cannot build a(n) {DOP}-monomer chain from any number of {block_size}-monomer blocks and {n_terminal} end groups')
+    # NOTE: not explicitly forcing n_seq_reps to catch lingering float input / inexact division errors
+    n_seq_reps = (DOP - n_terminal) // block_size # number of times to repeat the block sequence between end groups to reach the target chain length
+    if n_seq_reps < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced by hand
+        raise InsufficientChainLengthError(f'{DOP}-monomer chain has few total monomers to accomodate {n_terminal} end groups AND at least 1 middle monomer sequence')
+    # TODO: consider adding support for fractional sequence lengths IFF that fraction is a rational number whose denominator divides the sequence length...
+    # ...for example, could allow 5/2 * 'BACA' to be interpreted as 'BACA|BACA|BA'; 5/3 * 'BACA' would still be invalid though
+    LOGGER.info(f'Target chain length achievable with {n_seq_reps} block sequence repeat(s) ({n_seq_reps}*{block_size} [{sequence}] middle monomers + {n_terminal} terminal monomers = {DOP} total monomers)')
+
+    # 2) ADD MIDDLE MONOMERS TO CHAIN
     chain = MBPolymer() 
-    for (resname, middle_monomer), sequence_key in zip(monomers.iter_rdmols(term_only=False), sequence): # zip with sequence limits number of middle monomers to length of block sequence
+    for (resname, middle_monomer), sequence_key in zip(
+            monomers.iter_rdmols(term_only=False),
+            uniquify_str(sequence, preserve_order=True), # only register a new monomer for each appearance of a new indicator in the sequence
+        ): # zip with sequence limits number of middle monomers to length of block sequence
         LOGGER.info(f'Registering middle monomer {resname} (block identifier "{sequence_key}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer)
         chain.add_monomer(compound=mb_monomer, indices=linker_ids)
 
-    # 2) ADD TERMINAL MONOMERS TO CHAIN
+    # 3) ADD TERMINAL MONOMERS TO CHAIN
     term_iters = { # need to convert to iterators to allow for generator-like advancement (required for term group selection to behave as expected)
         resname : iter(rdmol_list)   # made necessary by annoying list-bound structure of current substructure spec
             for resname, rdmol_list in monomers.rdmols(term_only=True).items() 
@@ -95,16 +115,16 @@ def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A',
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer)
         chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
 
-    # 3) ASSEMBLE AND RETURN CHAIN
-    n_atoms = estimate_chain_len_linear(monomers, DOP)
-    LOGGER.info(f'Assembling linear polymer chain with {DOP} monomers ({n_atoms} atoms)')
-    chain.build(DOP - 2, sequence=sequence, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
+    # 4) ASSEMBLE AND RETURN CHAIN
+    n_atoms_est = estimate_chain_len_linear(monomers, DOP) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
+    LOGGER.info(f'Assembling linear {DOP}-mer chain (estimated {n_atoms_est} atoms)')
+    chain.build(n_seq_reps, sequence=sequence, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
     for atom in chain.particles():
         atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings)
-    LOGGER.info(f'Successfully assembled linear polymer chain with {DOP} monomers ({n_atoms} atoms)')
+    LOGGER.info(f'Successfully assembled linear {DOP}-mer chain (exactly {chain.n_particles} atoms)')
     
     if energy_minimize:
-        LOGGER.info('Energy-minimizing chain to find more stabile conformer')
+        LOGGER.info('Energy-minimizing chain to find more stable conformer')
         chain.energy_minimize()
         LOGGER.info('Energy minimization completed')
 

From c18d394f2368bec1fe37b10a4ca4b1817a557003 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 3 Dec 2024 21:02:43 -0700
Subject: [PATCH 05/78] Established placeholder file + sample fragments for
 polymer building unit tests

---
 polymerist/tests/data/peg-pla-pga.json     | 38 ++++++++++++++++++++++
 polymerist/tests/polymers/test_building.py | 18 ++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 polymerist/tests/data/peg-pla-pga.json
 create mode 100644 polymerist/tests/polymers/test_building.py

diff --git a/polymerist/tests/data/peg-pla-pga.json b/polymerist/tests/data/peg-pla-pga.json
new file mode 100644
index 0000000..2db06af
--- /dev/null
+++ b/polymerist/tests/data/peg-pla-pga.json
@@ -0,0 +1,38 @@
+{
+    "__class__": "MonomerGroup",
+    "__values__": {
+        "monomers": {
+            "PEG-1A": [
+                "[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7])-[#1D1+0:5]"
+            ],
+            "PEG-1B": [
+                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#8D2+0:5]-[#1D1+0:10])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]"
+            ],
+            "PEG-2": [
+                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[*:5])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]"
+            ],
+            "PLA-1A": [
+                "[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:4](=[#8D1+0:5])-[*:6])-[#1D1+0:8])-[#1D1+0:7]"
+            ],
+            "PLA-1B": [
+                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[#8D2+0:7]-[#1D1+0:12])-[#1D1+0:8]"
+            ],
+            "PLA-2": [
+                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[*:7])-[#1D1+0:8]"
+            ],
+            "PGA-1A": [
+                "[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]"
+            ],
+            "PGA-1B": [
+                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]"
+            ],
+            "PGA-2": [
+                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]"
+            ]
+        },
+        "term_orient": {
+            "PEG-1A": "head",
+            "PEG-1B": "tail"
+        }
+    }
+}
\ No newline at end of file
diff --git a/polymerist/tests/polymers/test_building.py b/polymerist/tests/polymers/test_building.py
new file mode 100644
index 0000000..799123f
--- /dev/null
+++ b/polymerist/tests/polymers/test_building.py
@@ -0,0 +1,18 @@
+'''Unit tests for `polymers.building` module'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+import pytest
+from pathlib import Path
+
+from polymerist.genutils.importutils.pkginspect import get_file_path_within_package
+from polymerist.tests import data as testdata
+
+from polymerist.polymers import building
+
+@pytest.fixture
+def fragments_path() -> Path:
+    return get_file_path_within_package('peg-pla-pga.json', testdata)
+
+# Also add separate tests module for polymers.estimation

From 4cc360d179bd260c836c68709f70c515fbcf5c19 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 3 Dec 2024 21:13:21 -0700
Subject: [PATCH 06/78] Added internal used-monomer-only MonomerGroup which
 improves accuracy of n_atoms estimate

---
 polymerist/polymers/building.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 8ccd864..42aaaeb 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -94,7 +94,10 @@ def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A',
     # ...for example, could allow 5/2 * 'BACA' to be interpreted as 'BACA|BACA|BA'; 5/3 * 'BACA' would still be invalid though
     LOGGER.info(f'Target chain length achievable with {n_seq_reps} block sequence repeat(s) ({n_seq_reps}*{block_size} [{sequence}] middle monomers + {n_terminal} terminal monomers = {DOP} total monomers)')
 
-    # 2) ADD MIDDLE MONOMERS TO CHAIN
+    # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
+    monomers_used = MonomerGroup() # used to track and estimate sized of the monomers being used
+    
+    ## 2A) ADD MIDDLE MONOMERS TO CHAIN
     chain = MBPolymer() 
     for (resname, middle_monomer), sequence_key in zip(
             monomers.iter_rdmols(term_only=False),
@@ -103,8 +106,9 @@ def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A',
         LOGGER.info(f'Registering middle monomer {resname} (block identifier "{sequence_key}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer)
         chain.add_monomer(compound=mb_monomer, indices=linker_ids)
+        monomers_used.monomers[resname] = monomers.monomers[resname]
 
-    # 3) ADD TERMINAL MONOMERS TO CHAIN
+    ## 2B) ADD TERMINAL MONOMERS TO CHAIN
     term_iters = { # need to convert to iterators to allow for generator-like advancement (required for term group selection to behave as expected)
         resname : iter(rdmol_list)   # made necessary by annoying list-bound structure of current substructure spec
             for resname, rdmol_list in monomers.rdmols(term_only=True).items() 
@@ -114,9 +118,10 @@ def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A',
         term_monomer = next(term_iters[resname])
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer)
         chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
+        monomers_used.monomers[resname] = monomers.monomers[resname]
 
-    # 4) ASSEMBLE AND RETURN CHAIN
-    n_atoms_est = estimate_chain_len_linear(monomers, DOP) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
+    # 3) ASSEMBLE AND RETURN CHAIN
+    n_atoms_est = estimate_chain_len_linear(monomers_used, DOP) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
     LOGGER.info(f'Assembling linear {DOP}-mer chain (estimated {n_atoms_est} atoms)')
     chain.build(n_seq_reps, sequence=sequence, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
     for atom in chain.particles():

From 173990cb29d0cbd8a5314dfb8d68288f1ce4e53f Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 3 Dec 2024 21:30:15 -0700
Subject: [PATCH 07/78] Deprecated DOP alias for n_monomers property

---
 polymerist/polymers/monographs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/polymerist/polymers/monographs.py b/polymerist/polymers/monographs.py
index 28f6fcd..8c9b591 100644
--- a/polymerist/polymers/monographs.py
+++ b/polymerist/polymers/monographs.py
@@ -38,7 +38,6 @@ def get_flavor_dict_at_node_index(self, node_idx : int) -> Optional[dict[int, in
     def num_monomers(self) -> int:
         '''Number of monomer units represented in the current polymer'''
         return self.number_of_nodes()
-    DOP = num_monomers
 
     @property
     def is_unbranched(self) -> bool:

From 8eb726790c34852138d7c1e25fa45ffa5c3122ee Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 3 Dec 2024 21:42:03 -0700
Subject: [PATCH 08/78] Expunged all references to "DOP" in favor of clearer
 terminology

---
 polymerist/polymers/building.py   | 20 +++++++-------
 polymerist/polymers/estimation.py | 45 ++++++++++++++++---------------
 2 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 42aaaeb..5105113 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -17,7 +17,7 @@
 from rdkit import Chem
 
 from .exceptions import InsufficientChainLengthError, MorphologyError
-from .estimation import estimate_chain_len_linear
+from .estimation import estimate_n_atoms_linear
 
 from ..genutils.decorators.functional import allow_string_paths
 from ..genutils.textual.strsearch import uniquify_str
@@ -63,7 +63,7 @@ def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int
 
 
 # LINEAR POLYMER BUILDING
-def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A', add_Hs : bool=False, energy_minimize : bool=False) -> MBPolymer:
+def build_linear_polymer(monomers : MonomerGroup, n_monomers : int, sequence : str='A', add_Hs : bool=False, energy_minimize : bool=False) -> MBPolymer:
     '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON)
     and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
     # 0) VERIFY THAT CHAIN ACTUAL CAN DEFINE LINEAR POLYMER
@@ -84,15 +84,15 @@ def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A',
     n_terminal = len(term_orient) # determine how many terminal monomers are actually present and well-defined
     block_size = len(sequence)
     
-    if ((DOP - n_terminal) % block_size) != 0:
-        raise ValueError(f'Cannot build a(n) {DOP}-monomer chain from any number of {block_size}-monomer blocks and {n_terminal} end groups')
+    if ((n_monomers - n_terminal) % block_size) != 0:
+        raise ValueError(f'Cannot build a(n) {n_monomers}-monomer chain from any number of {block_size}-monomer blocks and {n_terminal} end groups')
     # NOTE: not explicitly forcing n_seq_reps to catch lingering float input / inexact division errors
-    n_seq_reps = (DOP - n_terminal) // block_size # number of times to repeat the block sequence between end groups to reach the target chain length
+    n_seq_reps = (n_monomers - n_terminal) // block_size # number of times to repeat the block sequence between end groups to reach the target chain length
     if n_seq_reps < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced by hand
-        raise InsufficientChainLengthError(f'{DOP}-monomer chain has few total monomers to accomodate {n_terminal} end groups AND at least 1 middle monomer sequence')
+        raise InsufficientChainLengthError(f'{n_monomers}-monomer chain has too few total monomers to accommodate {n_terminal} end groups AND at least 1 middle monomer sequence')
     # TODO: consider adding support for fractional sequence lengths IFF that fraction is a rational number whose denominator divides the sequence length...
     # ...for example, could allow 5/2 * 'BACA' to be interpreted as 'BACA|BACA|BA'; 5/3 * 'BACA' would still be invalid though
-    LOGGER.info(f'Target chain length achievable with {n_seq_reps} block sequence repeat(s) ({n_seq_reps}*{block_size} [{sequence}] middle monomers + {n_terminal} terminal monomers = {DOP} total monomers)')
+    LOGGER.info(f'Target chain length achievable with {n_seq_reps} block sequence repeat(s) ({n_seq_reps}*{block_size} [{sequence}] middle monomers + {n_terminal} terminal monomers = {n_monomers} total monomers)')
 
     # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
     monomers_used = MonomerGroup() # used to track and estimate sized of the monomers being used
@@ -121,12 +121,12 @@ def build_linear_polymer(monomers : MonomerGroup, DOP : int, sequence : str='A',
         monomers_used.monomers[resname] = monomers.monomers[resname]
 
     # 3) ASSEMBLE AND RETURN CHAIN
-    n_atoms_est = estimate_chain_len_linear(monomers_used, DOP) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
-    LOGGER.info(f'Assembling linear {DOP}-mer chain (estimated {n_atoms_est} atoms)')
+    n_atoms_est = estimate_n_atoms_linear(monomers_used, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
+    LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)')
     chain.build(n_seq_reps, sequence=sequence, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
     for atom in chain.particles():
         atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings)
-    LOGGER.info(f'Successfully assembled linear {DOP}-mer chain (exactly {chain.n_particles} atoms)')
+    LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {chain.n_particles} atoms)')
     
     if energy_minimize:
         LOGGER.info('Energy-minimizing chain to find more stable conformer')
diff --git a/polymerist/polymers/estimation.py b/polymerist/polymers/estimation.py
index fa56799..37d2784 100644
--- a/polymerist/polymers/estimation.py
+++ b/polymerist/polymers/estimation.py
@@ -12,10 +12,10 @@
 from ..rdutils.bonding.portlib import get_num_ports
 
 
-def estimate_chain_len_linear(monomers : MonomerGroup, DOP : int) -> int:
+def estimate_n_atoms_linear(monomers : MonomerGroup, n_monomers : int) -> int:
     '''Given a set of monomers and the desired degree of polymerization, estimate the length of the resulting chain
     !NOTE! : As-implemented, only works for linear homopolymers and block copolymers with equal an distribution of monomers'''
-    # TOSELF : omitted logging for now, as it gets repeated on EVERY cycle in when called estimate_DOP_lower
+    # TOSELF : omitted logging for now, as it gets repeated on EVERY cycle when called from estimate_n_monomers_infimum()
     num_mono = monomers.n_monomers
     mono_term    = np.zeros(num_mono, dtype=bool) # terminality of each monomer (i.e. whether or not it is a term group)
     mono_multip  = np.zeros(num_mono, dtype=int) # multiplicity of each polymer (i.e. how many times is occurs in a chain)
@@ -32,27 +32,30 @@ def estimate_chain_len_linear(monomers : MonomerGroup, DOP : int) -> int:
 
     num_term = sum(mono_term)
     num_mid  = num_mono - num_term # assumed that all monomers are either terminal or not
-    mono_multip[~mono_term] = (DOP - num_term) / num_mid # naive assumption that all middle monomers contribute rest of chain equally (for homopolymers, this is always true)
+    mono_multip[~mono_term] = (n_monomers - num_term) / num_mid # naive assumption that all middle monomers contribute rest of chain equally (for homopolymers, this is always true)
 
     N = mono_contrib @ mono_multip # compute dot product to yield final count
     
     return N
 
-def estimate_DOP_lower(monomers : MonomerGroup, max_chain_len : int, min_DOP : int=3) -> int:
-    '''Returns the largest DOP for a set of monomers which yields a chain no longer than the specified chain length'''
-    base_chain_len = estimate_chain_len_linear(monomers, min_DOP)
-    if base_chain_len > max_chain_len: # pre-check when optimization is impossible
-        raise InsufficientChainLengthError(f'Even shortest possible chain (DOP={min_DOP}, N={base_chain_len}) is longer than the specified max length of {max_chain_len} atoms')
-
-    DOP = min_DOP 
-    while estimate_chain_len_linear(monomers, DOP + 1) < max_chain_len: # check if adding 1 more monomer keeps the length below the threshold
-        DOP += 1
-
-    return DOP
-
-def estimate_DOP_upper(monomers : MonomerGroup, min_chain_len : int, min_DOP : int=3) -> int: # NOTE : as currently defined, this also subsumes the case when the estimate and calculated length are exactly equal
-    '''Returns the smallest DOP for a set of monomers which yields a chain no shorter than the specified chain length'''
-    return estimate_DOP_lower(monomers, min_chain_len, min_DOP=min_DOP) + 1 # by definition, this is just 1 monomer longer than the lower bound
-
-estimate_DOP_infimum  = estimate_DOP_upper # more descriptive aliases to alleviate confusion (originals kept in for backwards compatibility)
-estimate_DOP_supremum = estimate_DOP_lower # more descriptive aliases to alleviate confusion (originals kept in for backwards compatibility)
\ No newline at end of file
+def estimate_n_monomers_infimum(monomers : MonomerGroup, n_atoms_max : int, n_monomers_min : int=3) -> int:
+    '''
+    For a given collection of monomer fragments, returns the largest number of monomers which guarantees that
+    a polymer chain made up of those monomers will have no more than the specified maximum number of atoms
+    '''
+    n_atoms_base = estimate_n_atoms_linear(monomers, n_monomers_min)
+    if n_atoms_base > n_atoms_max: # pre-check when optimization is impossible
+        raise InsufficientChainLengthError(f'Even shortest possible chain ({n_monomers_min} monomers, with {n_atoms_base} atoms) is longer than the specified max length of {n_atoms_max} atoms')
+
+    n_monomers = n_monomers_min 
+    while estimate_n_atoms_linear(monomers, n_monomers + 1) < n_atoms_max: # check if adding 1 more monomer keeps the length below the threshold
+        n_monomers += 1
+
+    return n_monomers
+
+def estimate_n_monomers_supremum(monomers : MonomerGroup, n_atoms_min : int, n_monomers_min : int=3) -> int: # NOTE : as currently defined, this also subsumes the case when the estimate and calculated length are exactly equal
+    '''
+    For a given collection of monomer fragments, returns the smallest number of monomers which guarantees that
+    a polymer chain made up of those monomers will have no fewer than the specified minimum number of atoms
+    '''
+    return estimate_n_monomers_infimum(monomers, n_atoms_min, n_monomers_min=n_monomers_min) + 1 # by definition, any more monomers than the infimum guarantees the chain will surpass a given number of atoms
\ No newline at end of file

From 88f8b6b1d1ccd3b438ecbbf593ed6511924ea1b4 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 14:33:29 -0700
Subject: [PATCH 09/78] Deprecated filter_text_by_condition()

---
 polymerist/genutils/textual/strsearch.py | 30 +-----------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/polymerist/genutils/textual/strsearch.py b/polymerist/genutils/textual/strsearch.py
index 14e080a..ac52489 100644
--- a/polymerist/genutils/textual/strsearch.py
+++ b/polymerist/genutils/textual/strsearch.py
@@ -3,9 +3,6 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
-from typing import Callable, Optional
-from pathlib import Path
-
 
 def uniquify_str(string : str, preserve_order : bool=True) -> str:
     '''
@@ -44,29 +41,4 @@ def shortest_repeating_substring(string : str) -> str:
     '''Return the shortest substring such that the passed string can be written as some number of repeats (including 1) of the substring
     Will return the original string if no simpler decomposition exists'''
     i = (2*string).find(string, 1, -1) # check if string matches itself in a cycle in non-trivial way (i.e more than just the two repeats)
-    return string if (i == -1) else string[:i]
-
-def filter_text_by_condition(in_text_path : Path, condition : Callable[[str], bool], out_text_path : Optional[Path]=None, postfix : str='filtered', inclusive : bool=True, return_filtered_path : bool=False) -> Optional[Path]:
-    '''Create a copy of a text-based file containing only the lines which match to a given boolean condition
-    
-    If no explicit output path is given, will create an output file in the same directory as the source file
-    with the same name plus "postfix" tacked on. Can optionally return the path to the filtered file (else None)
-
-    "Inclusive" kw governs whether to write lines which DO or DON'T meet the condition'''
-    if out_text_path is None:
-        out_text_path = in_text_path.with_stem(f'{in_text_path.stem}{"_" if postfix else ""}{postfix}')
-
-    if (out_text_path == in_text_path):
-        raise PermissionError(f'Attempting to overwrite {in_text_path} with regex filter') # prevent write clash
-    
-    if (out_text_path.suffix != in_text_path.suffix):  # prevent file type conversion during transfer
-        raise ValueError(f'Input and output file must have same extension (not {in_text_path.suffix} and {out_text_path.suffix})')
-
-    with out_text_path.open('w') as outfile: 
-        with in_text_path.open('r') as infile: # readfile is innermost in case error occurs during file read (caught by handler one level up)
-            for line in infile:
-                if (condition(line) == inclusive): # only write lines if (matching AND inclusive) OR (not matching AND exclusive)
-                    outfile.write(line)
-
-    if return_filtered_path:
-        return out_text_path
\ No newline at end of file
+    return string if (i == -1) else string[:i]
\ No newline at end of file

From 98cec8906ed8b06edd563306ca55d07682f6d1a8 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 14:35:05 -0700
Subject: [PATCH 10/78] Renamed textual.strsearch to textual.substrings,
 updated docstring

---
 polymerist/genutils/textual/{strsearch.py => substrings.py} | 2 +-
 polymerist/polymers/building.py                             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename polymerist/genutils/textual/{strsearch.py => substrings.py} (94%)

diff --git a/polymerist/genutils/textual/strsearch.py b/polymerist/genutils/textual/substrings.py
similarity index 94%
rename from polymerist/genutils/textual/strsearch.py
rename to polymerist/genutils/textual/substrings.py
index ac52489..7a43815 100644
--- a/polymerist/genutils/textual/strsearch.py
+++ b/polymerist/genutils/textual/substrings.py
@@ -1,4 +1,4 @@
-'''For searching and replacing through strings and text files'''
+'''For identifying and concatenating substrings of other strings with unique properties'''
 
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 5105113..9b08067 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -20,7 +20,7 @@
 from .estimation import estimate_n_atoms_linear
 
 from ..genutils.decorators.functional import allow_string_paths
-from ..genutils.textual.strsearch import uniquify_str
+from ..genutils.textual.substrings import uniquify_str
 
 from ..rdutils.bonding.portlib import get_linker_ids
 from ..rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports

From de314e758ff010816539a51dfa7548518c552e20 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 14:35:46 -0700
Subject: [PATCH 11/78] Implemented function for repeating a string a (possibly
 fractional) number of times

---
 polymerist/genutils/textual/substrings.py | 38 ++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/polymerist/genutils/textual/substrings.py b/polymerist/genutils/textual/substrings.py
index 7a43815..d9b7662 100644
--- a/polymerist/genutils/textual/substrings.py
+++ b/polymerist/genutils/textual/substrings.py
@@ -41,4 +41,40 @@ def shortest_repeating_substring(string : str) -> str:
     '''Return the shortest substring such that the passed string can be written as some number of repeats (including 1) of the substring
     Will return the original string if no simpler decomposition exists'''
     i = (2*string).find(string, 1, -1) # check if string matches itself in a cycle in non-trivial way (i.e more than just the two repeats)
-    return string if (i == -1) else string[:i]
\ No newline at end of file
+    return string if (i == -1) else string[:i]
+
+def repeat_string_to_length(string : str, target_length : int) -> str:
+    '''
+    Takes a string and repeats it cyclically to produce another string of a given length
+    The number of times the original string occurs in the new string may be fractional
+    for example:
+    >> repeat_string_to_length("CAT", 6) -> "CATCAT"
+    >> repeat_string_to_length("BACA", 10) -> "BACABACABA"
+    
+    Parameters
+    ----------
+    string : str
+        An arbitrary string to repeat
+    target_length : int
+        The length of the final desired string
+        This does NOT have to be an integer multiple of the length of "string"
+            E.g. repeat_string_to_length("BACA", 10) -> "BACABACABA"
+        Nor does it have to be greater than the length of "string"
+            E.g. repeat_string_to_length("BACA", 3) -> "BAC"
+            
+    Returns
+    -------
+    rep_string : str
+        A new string which has the desired target length and consists of cycles of the initial string
+    '''
+    if not string:
+        raise ValueError(f'Cannot generate nonempty string from any amount of repeats of the empty string')
+    return (string*(target_length//len(string) + 1))[:target_length] # repeat to smallest # time
+    
+    # Implementation 2) more readable, but slightly slower in benchmark
+    # whole_reps, fract_reps = divmod(target_length, len(string))
+    # return whole_reps*string + string[fract_reps:]
+    
+    # Implementation 3) most compact, but introduces itertools dependency
+    # Interestingly, this yields empty string instead of division-by-zero error w/ empty string as input
+    # return ''.join(islice(cycle(string), target_length)) 
\ No newline at end of file

From 3685b396f754af6871ee74c61071a8a17a839059 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 15:40:36 -0700
Subject: [PATCH 12/78] Added argument for indicating separator between string
 repeats

---
 polymerist/genutils/textual/substrings.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/polymerist/genutils/textual/substrings.py b/polymerist/genutils/textual/substrings.py
index d9b7662..ea3d8e5 100644
--- a/polymerist/genutils/textual/substrings.py
+++ b/polymerist/genutils/textual/substrings.py
@@ -43,7 +43,7 @@ def shortest_repeating_substring(string : str) -> str:
     i = (2*string).find(string, 1, -1) # check if string matches itself in a cycle in non-trivial way (i.e more than just the two repeats)
     return string if (i == -1) else string[:i]
 
-def repeat_string_to_length(string : str, target_length : int) -> str:
+def repeat_string_to_length(string : str, target_length : int, join_indicator : str='') -> str:
     '''
     Takes a string and repeats it cyclically to produce another string of a given length
     The number of times the original string occurs in the new string may be fractional
@@ -69,12 +69,9 @@ def repeat_string_to_length(string : str, target_length : int) -> str:
     '''
     if not string:
         raise ValueError(f'Cannot generate nonempty string from any amount of repeats of the empty string')
-    return (string*(target_length//len(string) + 1))[:target_length] # repeat to smallest # time
     
-    # Implementation 2) more readable, but slightly slower in benchmark
-    # whole_reps, fract_reps = divmod(target_length, len(string))
-    # return whole_reps*string + string[fract_reps:]
+    num_str_reps, num_extra_chars = divmod(target_length, len(string))
+    remainder = (string[:num_extra_chars]) if num_extra_chars else () # empty container avoids extra joiner at end when remainder string is empty
     
-    # Implementation 3) most compact, but introduces itertools dependency
-    # Interestingly, this yields empty string instead of division-by-zero error w/ empty string as input
-    # return ''.join(islice(cycle(string), target_length)) 
\ No newline at end of file
+    return join_indicator.join(num_str_reps*(string,) + remainder) # tuples here are ~2 OOM faster than moral equivalent with lists
+    
\ No newline at end of file

From 497791e169db0c7cd5a611faaef6f79e4d49f293 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 15:41:06 -0700
Subject: [PATCH 13/78] Renamed uniquify_str() to unique_string()

---
 polymerist/genutils/textual/substrings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/polymerist/genutils/textual/substrings.py b/polymerist/genutils/textual/substrings.py
index ea3d8e5..d6192db 100644
--- a/polymerist/genutils/textual/substrings.py
+++ b/polymerist/genutils/textual/substrings.py
@@ -4,7 +4,7 @@
 __email__ = 'timotej.bernat@colorado.edu'
 
 
-def uniquify_str(string : str, preserve_order : bool=True) -> str:
+def unique_string(string : str, preserve_order : bool=True) -> str:
     '''
     Accepts a string and returns another string containing
     only the UNIQUE characters in the origin string
@@ -18,8 +18,8 @@ def uniquify_str(string : str, preserve_order : bool=True) -> str:
     preserve_order : bool, default True
         Whether or not to keep the unique characters in the order they are found
         For example: 
-            uniquify_str("balaclava", preserve_order=False) -> "bcavl"
-            uniquify_str("balaclava", preserve_order=True) -> "balcv"
+            unique_string("balaclava", preserve_order=False) -> "bcavl"
+            unique_string("balaclava", preserve_order=True) -> "balcv"
         
     Returns
     -------

From 3c7448e9f728d651bbcb7f6240ba5ff44a325d1a Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 15:48:40 -0700
Subject: [PATCH 14/78] Renamed "join_indicator" to "joiner" for brevity

---
 polymerist/genutils/textual/substrings.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/polymerist/genutils/textual/substrings.py b/polymerist/genutils/textual/substrings.py
index d6192db..924d2aa 100644
--- a/polymerist/genutils/textual/substrings.py
+++ b/polymerist/genutils/textual/substrings.py
@@ -43,7 +43,7 @@ def shortest_repeating_substring(string : str) -> str:
     i = (2*string).find(string, 1, -1) # check if string matches itself in a cycle in non-trivial way (i.e more than just the two repeats)
     return string if (i == -1) else string[:i]
 
-def repeat_string_to_length(string : str, target_length : int, join_indicator : str='') -> str:
+def repeat_string_to_length(string : str, target_length : int, joiner : str='') -> str:
     '''
     Takes a string and repeats it cyclically to produce another string of a given length
     The number of times the original string occurs in the new string may be fractional
@@ -73,5 +73,5 @@ def repeat_string_to_length(string : str, target_length : int, join_indicator :
     num_str_reps, num_extra_chars = divmod(target_length, len(string))
     remainder = (string[:num_extra_chars]) if num_extra_chars else () # empty container avoids extra joiner at end when remainder string is empty
     
-    return join_indicator.join(num_str_reps*(string,) + remainder) # tuples here are ~2 OOM faster than moral equivalent with lists
+    return joiner.join(num_str_reps*(string,) + remainder) # tuples here are ~2 OOM faster than moral equivalent with lists
     
\ No newline at end of file

From 13c9db7820c8ebef824a6a90683dd6010845661a Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 16:11:37 -0700
Subject: [PATCH 15/78] Fixed bug with parenthesization vs tuplification

---
 polymerist/genutils/textual/substrings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polymerist/genutils/textual/substrings.py b/polymerist/genutils/textual/substrings.py
index 924d2aa..2d9041f 100644
--- a/polymerist/genutils/textual/substrings.py
+++ b/polymerist/genutils/textual/substrings.py
@@ -71,7 +71,7 @@ def repeat_string_to_length(string : str, target_length : int, joiner : str='')
         raise ValueError(f'Cannot generate nonempty string from any amount of repeats of the empty string')
     
     num_str_reps, num_extra_chars = divmod(target_length, len(string))
-    remainder = (string[:num_extra_chars]) if num_extra_chars else () # empty container avoids extra joiner at end when remainder string is empty
+    remainder = (string[:num_extra_chars],) if num_extra_chars else () # empty container avoids extra joiner at end when remainder string is empty
     
     return joiner.join(num_str_reps*(string,) + remainder) # tuples here are ~2 OOM faster than moral equivalent with lists
     
\ No newline at end of file

From d3c8be7a2caa979ab6a71c4b9877e6cc0c00a477 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 16:11:47 -0700
Subject: [PATCH 16/78] Wrote unit tests for textual.substrings

---
 .../tests/genutils/textual/test_substrings.py | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 polymerist/tests/genutils/textual/test_substrings.py

diff --git a/polymerist/tests/genutils/textual/test_substrings.py b/polymerist/tests/genutils/textual/test_substrings.py
new file mode 100644
index 0000000..a7ab00a
--- /dev/null
+++ b/polymerist/tests/genutils/textual/test_substrings.py
@@ -0,0 +1,64 @@
+'''Unit tests for `substrings` package'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+import pytest
+
+from polymerist.genutils.textual.substrings import unique_string, shortest_repeating_substring, repeat_string_to_length
+
+
+@pytest.mark.parametrize('string', ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit'])
+def test_unique_str_unordered(string : str) -> None:
+    '''Test that unique characters are correctly identified WITHOUT respect to order'''
+    assert set(unique_string(string, preserve_order=False)) == set(string)
+
+@pytest.mark.parametrize('string, expected_output',
+    [
+        ('aaaaa', 'a'),
+        ('BABAB', 'BA'),
+        ('balaclava', 'balcv'),
+        ('catamaran', 'catmrn'),
+        ('unique', 'uniqe'), # self-reference makes everything better :P
+        ('singular', 'singular'), # test that strings with already-unique characters are unaffected
+    ]
+)
+def test_unique_str_ordered(string : str, expected_output : str) -> None:
+    '''Test that unique characters are correctly identified WITH respect to order'''
+    assert unique_string(string, preserve_order=True) == expected_output
+    
+    
+@pytest.mark.parametrize('string, expected_output',
+    [
+        ('aaaaa', 'a'),
+        ('booboo', 'boo'),
+        ('piripiri', 'piri'),
+        ('ababab', 'ab'),
+        ('bcbabcbabcba', 'bcba'),
+        # sequences which do not repeat a whole-number of times
+        ('no repeats', 'no repeats'),
+        ('ababa', 'ababa'), 
+        ('bonobo', 'bonobo'),
+    ]
+)
+def test_shortest_repeating_substring(string : str, expected_output : str) -> None:
+    '''Test that minimal repeating substrings are correctly identified'''
+    assert shortest_repeating_substring(string) == expected_output
+    
+    
+@pytest.mark.parametrize('string, target_length, joiner, expected_output',
+    [
+        ('BACA', 10, '', 'BACABACABA'), # expected "standard" use case
+        ('BACA', 1, '', 'B'),           # test case where target length is shorter than the whole string
+        ('BACA', 0, '', ''),            # test that no repeats yields the empty string
+        ('BACA', 4, '', 'BACA'),             # test precisely one repeat without joins
+        ('BACA', 10, '|', 'BACA|BACA|BA'),   # test joiners
+        ('BACA', 4, '|', 'BACA'),            # test no joiners are added when exactly one string repeat occurs
+        ('BACA', 12, '|', 'BACA|BACA|BACA'), # test no extraneous joiners are included for purely-whole number of repeats
+        ('CAT', 5, '', 'CATCA'), # test with triads (and different base string)
+        pytest.param('', 7, '', None, marks=pytest.mark.xfail(raises=ValueError, reason='Empty string can\'t be repeated into nonempty string', strict=True)),
+    ]
+)
+def test_repeat_string_to_length(string : str, target_length : int, joiner : str, expected_output : str) -> None:
+    '''Test that string repetition to a given length returns the expected string WITH joining characters present'''
+    assert repeat_string_to_length(string, target_length=target_length, joiner=joiner) == expected_output
\ No newline at end of file

From 487ae6a910e7847ffc4dae520ccb02cc6d6da38a Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 16:58:00 -0700
Subject: [PATCH 17/78] Delayed monomer linearity check to only be on the
 monomer fragments selected for building

---
 polymerist/polymers/building.py | 43 +++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 9b08067..1705385 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -20,7 +20,7 @@
 from .estimation import estimate_n_atoms_linear
 
 from ..genutils.decorators.functional import allow_string_paths
-from ..genutils.textual.substrings import uniquify_str
+from ..genutils.textual.substrings import unique_string
 
 from ..rdutils.bonding.portlib import get_linker_ids
 from ..rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports
@@ -63,13 +63,17 @@ def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int
 
 
 # LINEAR POLYMER BUILDING
-def build_linear_polymer(monomers : MonomerGroup, n_monomers : int, sequence : str='A', add_Hs : bool=False, energy_minimize : bool=False) -> MBPolymer:
+def build_linear_polymer(
+        monomers : MonomerGroup,
+        n_monomers : int,
+        sequence : str='A',
+        allow_partial_sequences : bool=True,
+        add_Hs : bool=False,
+        energy_minimize : bool=False,
+    ) -> MBPolymer:
     '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON)
     and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
-    # 0) VERIFY THAT CHAIN ACTUAL CAN DEFINE LINEAR POLYMER
-    if not monomers.is_linear:
-        raise MorphologyError('Linear polymer building does not support non-linear monomer input')
-    
+    # 0) DETERMINE THE ORIENTATION AND NUMBER OF TERMINAL MONOMERS, SUPPLYING THIS IF AN INVALID DEFINITION IS PROVIDED
     if monomers.has_valid_linear_term_orient: # DEV: consider moving this logic into MonomerGroup
         term_orient = monomers.term_orient
         LOGGER.info(f'Using pre-defined terminal group orientation {term_orient}')
@@ -81,32 +85,32 @@ def build_linear_polymer(monomers : MonomerGroup, n_monomers : int, sequence : s
         LOGGER.warning(f'No valid terminal monomer orientations defined; autogenerated orientations "{term_orient}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
 
     # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE)
-    n_terminal = len(term_orient) # determine how many terminal monomers are actually present and well-defined
+    n_mono_term = len(term_orient) # determine how many terminal monomers are actually present and well-defined
+    n_mono_middle = n_monomers - n_mono_term # in a linear chain, all monomers are either middle or terminal
     block_size = len(sequence)
     
-    if ((n_monomers - n_terminal) % block_size) != 0:
-        raise ValueError(f'Cannot build a(n) {n_monomers}-monomer chain from any number of {block_size}-monomer blocks and {n_terminal} end groups')
+    if (n_mono_middle % block_size) != 0:
+        raise ValueError(f'Cannot build a(n) {n_monomers}-monomer chain from any number of {block_size}-monomer blocks and {n_mono_term} end groups')
     # NOTE: not explicitly forcing n_seq_reps to catch lingering float input / inexact division errors
-    n_seq_reps = (n_monomers - n_terminal) // block_size # number of times to repeat the block sequence between end groups to reach the target chain length
+    n_seq_reps = n_mono_middle // block_size # number of times to repeat the block sequence between end groups to reach the target chain length
     if n_seq_reps < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced by hand
-        raise InsufficientChainLengthError(f'{n_monomers}-monomer chain has few total monomers to accomodate {n_terminal} end groups AND at least 1 middle monomer sequence')
+        raise InsufficientChainLengthError(f'{n_monomers}-monomer chain has few total monomers to accomodate {n_mono_term} end groups AND at least 1 middle monomer sequence')
     # TODO: consider adding support for fractional sequence lengths IFF that fraction is a rational number whose denominator divides the sequence length...
     # ...for example, could allow 5/2 * 'BACA' to be interpreted as 'BACA|BACA|BA'; 5/3 * 'BACA' would still be invalid though
-    LOGGER.info(f'Target chain length achievable with {n_seq_reps} block sequence repeat(s) ({n_seq_reps}*{block_size} [{sequence}] middle monomers + {n_terminal} terminal monomers = {n_monomers} total monomers)')
+    LOGGER.info(f'Target chain length achievable with {n_seq_reps} block sequence repeat(s) ({n_seq_reps}*{block_size} [{sequence}] middle monomers + {n_mono_term} terminal monomers = {n_monomers} total monomers)')
 
     # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
-    monomers_used = MonomerGroup() # used to track and estimate sized of the monomers being used
-    
+    monomers_selected = MonomerGroup() # used to track and estimate sizes of the monomers being used for building
     ## 2A) ADD MIDDLE MONOMERS TO CHAIN
     chain = MBPolymer() 
     for (resname, middle_monomer), sequence_key in zip(
             monomers.iter_rdmols(term_only=False),
-            uniquify_str(sequence, preserve_order=True), # only register a new monomer for each appearance of a new indicator in the sequence
+            unique_string(sequence, preserve_order=True), # only register a new monomer for each appearance of a new indicator in the sequence
         ): # zip with sequence limits number of middle monomers to length of block sequence
         LOGGER.info(f'Registering middle monomer {resname} (block identifier "{sequence_key}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer)
         chain.add_monomer(compound=mb_monomer, indices=linker_ids)
-        monomers_used.monomers[resname] = monomers.monomers[resname]
+        monomers_selected.monomers[resname] = monomers.monomers[resname]
 
     ## 2B) ADD TERMINAL MONOMERS TO CHAIN
     term_iters = { # need to convert to iterators to allow for generator-like advancement (required for term group selection to behave as expected)
@@ -118,10 +122,13 @@ def build_linear_polymer(monomers : MonomerGroup, n_monomers : int, sequence : s
         term_monomer = next(term_iters[resname])
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer)
         chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
-        monomers_used.monomers[resname] = monomers.monomers[resname]
+        monomers_selected.monomers[resname] = monomers.monomers[resname]
 
     # 3) ASSEMBLE AND RETURN CHAIN
-    n_atoms_est = estimate_n_atoms_linear(monomers_used, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
+    if not monomers_selected.is_linear: # verify the selected monomers actually define a linear polymer
+        raise MorphologyError('Linear polymer building does not support non-linear monomer input')
+    
+    n_atoms_est = estimate_n_atoms_linear(monomers_selected, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
     LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)')
     chain.build(n_seq_reps, sequence=sequence, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
     for atom in chain.particles():

From ddc20bcac532f76d57db614c8e362d9041d49aa8 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 17:06:22 -0700
Subject: [PATCH 18/78] Added range and int typing checks to target_length

---
 polymerist/genutils/textual/substrings.py            | 4 ++++
 polymerist/tests/genutils/textual/test_substrings.py | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/polymerist/genutils/textual/substrings.py b/polymerist/genutils/textual/substrings.py
index 2d9041f..83c6f4f 100644
--- a/polymerist/genutils/textual/substrings.py
+++ b/polymerist/genutils/textual/substrings.py
@@ -69,6 +69,10 @@ def repeat_string_to_length(string : str, target_length : int, joiner : str='')
     '''
     if not string:
         raise ValueError(f'Cannot generate nonempty string from any amount of repeats of the empty string')
+    if not isinstance(target_length, int):
+        raise TypeError(f'Only integer target string lengths are allowed, not non-integer type "{type(target_length).__name__}"')
+    if target_length < 0:
+        raise IndexError(f'Cannot generate a string of negative length (requested length of {target_length} character(s))')
     
     num_str_reps, num_extra_chars = divmod(target_length, len(string))
     remainder = (string[:num_extra_chars],) if num_extra_chars else () # empty container avoids extra joiner at end when remainder string is empty
diff --git a/polymerist/tests/genutils/textual/test_substrings.py b/polymerist/tests/genutils/textual/test_substrings.py
index a7ab00a..1130812 100644
--- a/polymerist/tests/genutils/textual/test_substrings.py
+++ b/polymerist/tests/genutils/textual/test_substrings.py
@@ -56,7 +56,9 @@ def test_shortest_repeating_substring(string : str, expected_output : str) -> No
         ('BACA', 4, '|', 'BACA'),            # test no joiners are added when exactly one string repeat occurs
         ('BACA', 12, '|', 'BACA|BACA|BACA'), # test no extraneous joiners are included for purely-whole number of repeats
         ('CAT', 5, '', 'CATCA'), # test with triads (and different base string)
-        pytest.param('', 7, '', None, marks=pytest.mark.xfail(raises=ValueError, reason='Empty string can\'t be repeated into nonempty string', strict=True)),
+        pytest.param(''   ,   7, '', None, marks=pytest.mark.xfail(raises=ValueError, reason='Empty string can\'t be repeated into nonempty string', strict=True)),
+        pytest.param('CAT', 4.2, '', None, marks=pytest.mark.xfail(raises=TypeError , reason='Non-integer string length doesn\'t make sense', strict=True)),
+        pytest.param('CAT',  -1, '', None, marks=pytest.mark.xfail(raises=IndexError, reason='Can\'t have string with fewer than 0 characters', strict=True)),
     ]
 )
 def test_repeat_string_to_length(string : str, target_length : int, joiner : str, expected_output : str) -> None:

From 8714eb841a4d8081240571d2e1b46fa7632929c2 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 17:56:28 -0700
Subject: [PATCH 19/78] Added option to register residue names when converting
 a spec SMARTS fragment into an mBuild Compound

---
 polymerist/polymers/building.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 1705385..54f119c 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -6,6 +6,8 @@
 import logging
 LOGGER = logging.getLogger(__name__)
 
+from typing import Optional
+
 import warnings
 with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings
     warnings.filterwarnings('ignore',  category=DeprecationWarning)
@@ -31,23 +33,29 @@
 
 
 # CONVERSION
-def mbmol_from_mono_rdmol(rdmol : Chem.Mol) -> tuple[Compound, list[int]]:
-    '''Accepts a monomer-spec-compliant SMARTS string and returns an mbuild Compound and a list of the indices of atom ports'''
+def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tuple[Compound, list[int]]:
+    '''
+    Accepts a monomer-spec-compliant SMARTS string and returns an mbuild Compound and a list of the indices of atom ports
+    If "resname" is provided, will assign that name to the mBuild Compound returned
+    '''
     linker_ids = [i for i in get_linker_ids(rdmol)] # record indices of ports - MUST unpack generator for mbuild compatibility
     
     # create port-free version of molecule which RDKit can embed without errors
     prot_mol = hydrogenate_rdmol_ports(rdmol, in_place=False)
     # prot_mol = saturate_ports(rdmol) # TOSELF : custom, port-based saturation methods are not yet ready for deployment - yield issues in RDKit representation under-the-hood 
     Chem.SanitizeMol(prot_mol, sanitizeOps=SANITIZE_AS_KEKULE) # ensure Mol is valid (avoids implicitValence issues)
+    
     mb_compound = mb.conversion.from_rdkit(prot_mol) # native from_rdkit() method actually appears to preserve atom ordering
+    if resname is not None:
+        mb_compound.name = resname
 
     return mb_compound, linker_ids
 
 @allow_string_paths
-def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int=2, res_repl : dict[str, str]=None) -> None:
+def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int=2, resname_repl : dict[str, str]=None) -> None:
     '''Save an MBuild Compound into an OpenMM-compatible PDB file'''
-    if res_repl is None: # avoid mutable default
-        res_repl = {'RES' : 'Pol'} 
+    if resname_repl is None: # avoid mutable default
+        resname_repl = {'RES' : 'Pol'} 
 
     traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more infor-rich format)
     omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory
@@ -58,7 +66,7 @@ def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int
         positions=omm_pos,
         uniquify_atom_ids=True,
         num_atom_id_digits=num_atom_digits,
-        resname_repl=res_repl
+        resname_repl=resname_repl
     )
 
 
@@ -108,7 +116,7 @@ def build_linear_polymer(
             unique_string(sequence, preserve_order=True), # only register a new monomer for each appearance of a new indicator in the sequence
         ): # zip with sequence limits number of middle monomers to length of block sequence
         LOGGER.info(f'Registering middle monomer {resname} (block identifier "{sequence_key}")')
-        mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer)
+        mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname)
         chain.add_monomer(compound=mb_monomer, indices=linker_ids)
         monomers_selected.monomers[resname] = monomers.monomers[resname]
 
@@ -118,9 +126,9 @@ def build_linear_polymer(
             for resname, rdmol_list in monomers.rdmols(term_only=True).items() 
     }
     for resname, head_or_tail in term_orient.items():
-        LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")')
         term_monomer = next(term_iters[resname])
-        mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer)
+        LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")')
+        mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer, resname=resname)
         chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
         monomers_selected.monomers[resname] = monomers.monomers[resname]
 

From 0fe10e91f4706ca1e57a30ee74c0ececa581745c Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 17:57:43 -0700
Subject: [PATCH 20/78] Renamed "resname_repl" to "resname_map" throughout

---
 polymerist/mdtools/openmmtools/serialization.py | 8 ++++----
 polymerist/polymers/building.py                 | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/polymerist/mdtools/openmmtools/serialization.py b/polymerist/mdtools/openmmtools/serialization.py
index 6bad38a..521e8f0 100644
--- a/polymerist/mdtools/openmmtools/serialization.py
+++ b/polymerist/mdtools/openmmtools/serialization.py
@@ -120,11 +120,11 @@ def serialize_system(sys_path : Path, system : System) -> None:
 
 @allow_string_paths
 def serialize_openmm_pdb(pdb_path : Path, topology : OpenMMTopology, positions : Union[NDArray, list[Vec3]], keep_chain_and_res_ids : bool=True,
-                         uniquify_atom_ids : bool=True, num_atom_id_digits : int=2, resname_repl : Optional[dict[str, str]]=None) -> None:
+                         uniquify_atom_ids : bool=True, num_atom_id_digits : int=2, resname_map : Optional[dict[str, str]]=None) -> None:
     '''Configure and write an Protein DataBank File from an OpenMM Topology and array of positions
     Provides options to configure atom ID numbering, residue numbering, and residue naming'''
-    if resname_repl is None:
-        resname_repl = {} # avoids mutable default
+    if resname_map is None:
+        resname_map = {} # avoids mutable default
 
     # chain config
     for chain in topology.chains():
@@ -133,7 +133,7 @@ def serialize_openmm_pdb(pdb_path : Path, topology : OpenMMTopology, positions :
     # residue config
     for residue in topology.residues():
         residue.id = str(residue.id) # avoids TypeError when specifying keepIds during PDB write
-        repl_res_name = resname_repl.get(residue.name, None) # lookup current residue name to see if a replacement is called for
+        repl_res_name = resname_map.get(residue.name, None) # lookup current residue name to see if a replacement is called for
         if repl_res_name is not None:
             residue.name = repl_res_name
 
diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 54f119c..1b6fc27 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -52,10 +52,10 @@ def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tup
     return mb_compound, linker_ids
 
 @allow_string_paths
-def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int=2, resname_repl : dict[str, str]=None) -> None:
+def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int=2, resname_map : dict[str, str]=None) -> None:
     '''Save an MBuild Compound into an OpenMM-compatible PDB file'''
-    if resname_repl is None: # avoid mutable default
-        resname_repl = {'RES' : 'Pol'} 
+    if resname_map is None: # avoid mutable default
+        resname_map = {'RES' : 'Pol'} 
 
     traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more infor-rich format)
     omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory
@@ -66,7 +66,7 @@ def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int
         positions=omm_pos,
         uniquify_atom_ids=True,
         num_atom_id_digits=num_atom_digits,
-        resname_repl=resname_repl
+        resname_map=resname_map
     )
 
 

From 0bd435543ace8f52613a77500a53872f36d6ae62 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 4 Dec 2024 19:04:49 -0700
Subject: [PATCH 21/78] Implemented mBuild Compound to RDKit converter which
 preserves conformer and residue info

---
 polymerist/polymers/building.py | 47 +++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 1b6fc27..ba5042b 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -17,6 +17,7 @@
 
 from pathlib import Path
 from rdkit import Chem
+from collections import Counter
 
 from .exceptions import InsufficientChainLengthError, MorphologyError
 from .estimation import estimate_n_atoms_linear
@@ -68,6 +69,52 @@ def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int
         num_atom_id_digits=num_atom_digits,
         resname_map=resname_map
     )
+    
+# TODO: deduplicate PDB atom name and residue numbering code against serialize_openmm_pdb()
+def mbmol_to_rdmol(
+        mbmol : Compound,
+        uniquify_atom_ids : bool=False,
+        num_atom_id_digits : int=2,
+        resname_map : Optional[dict[str, str]]=None
+    ) -> Chem.Mol:
+    '''Convert an mBuild Compound into an RDKit Mol, with correct atom coordinates and PDB residue info'''
+    if resname_map is None:
+        resname_map = {}
+    
+    rdmol = mbmol.to_rdkit()
+    conformer = Chem.Conformer()
+    conformer.Set3D(True)
+
+    atom_id : int = 0
+    element_counter = Counter()
+    for resnum, mb_monomer in enumerate(mbmol.children, start=1):
+        resname = resname_map.get(mb_monomer.name, mb_monomer.name[:3]) # if no remapping is found, just take first 3 chars
+        # NOTE: the order of monomers and atoms within those monomers were added in the same order as iterated over here...
+        #... so the atom indices **SHOULD** be in the correct order (hate that this even might be uncertain)
+        for mbatom in mb_monomer.particles(): 
+            conformer.SetAtomPosition(atom_id, 10*mbatom.pos.astype(float)) # convert from nm to angstrom
+
+            # set PDB residue info if monomer hierarchy is present
+            if mbatom != mb_monomer: # for Compounds with a flat hierarchy, the children and particles of children will coincide
+                symbol = mbatom.element.symbol
+                atom_ser_id = element_counter[symbol]
+                atom_ser_str = f'{atom_ser_id:0{num_atom_id_digits}d}' if uniquify_atom_ids else '  ' # double space keeps column justification correct when non-unique
+                atom_name = f' {symbol}{atom_ser_str}' # need a leading space to get column alignment in PDB compliant with spec
+                
+                pdb_info = Chem.AtomPDBResidueInfo(
+                    atomName=atom_name, 
+                    residueName=resname,
+                    residueNumber=resnum,
+                    chainId='1',
+                    isHeteroAtom=True,
+                )
+                element_counter[symbol] += 1 # only increment AFTER prior value has been assigned to the current atom
+                rdmol.GetAtomWithIdx(atom_id).SetPDBResidueInfo(pdb_info)
+            
+            atom_id += 1 # TODO: this is an awful way of keeping track of atom indices, see if there's a more secure way to do this
+    conf_id = rdmol.AddConformer(conformer)
+    
+    return rdmol
 
 
 # LINEAR POLYMER BUILDING

From d94733608cf7cfbe30e583b5506655555c90ac25 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 12:49:47 -0700
Subject: [PATCH 22/78] Deprecated irrelevant custom Exceptions, pared down use
 of "Error" suffix on Exception names

---
 polymerist/polymers/estimation.py |  4 ++--
 polymerist/polymers/exceptions.py | 34 ++++++++-----------------------
 2 files changed, 11 insertions(+), 27 deletions(-)

diff --git a/polymerist/polymers/estimation.py b/polymerist/polymers/estimation.py
index 37d2784..4dc3fa8 100644
--- a/polymerist/polymers/estimation.py
+++ b/polymerist/polymers/estimation.py
@@ -6,7 +6,7 @@
 import numpy as np
 from rdkit import Chem
 
-from .exceptions import InsufficientChainLengthError
+from .exceptions import InsufficientChainLength
 from ..genutils.iteration import iter_len
 from ..polymers.monomers.repr import MonomerGroup
 from ..rdutils.bonding.portlib import get_num_ports
@@ -45,7 +45,7 @@ def estimate_n_monomers_infimum(monomers : MonomerGroup, n_atoms_max : int, n_mo
     '''
     n_atoms_base = estimate_n_atoms_linear(monomers, n_monomers_min)
     if n_atoms_base > n_atoms_max: # pre-check when optimization is impossible
-        raise InsufficientChainLengthError(f'Even shortest possible chain ({n_monomers_min} monomers, with {n_atoms_base} atoms) is longer than the specified max length of {n_atoms_max} atoms')
+        raise InsufficientChainLength(f'Even shortest possible chain ({n_monomers_min} monomers, with {n_atoms_base} atoms) is longer than the specified max length of {n_atoms_max} atoms')
 
     n_monomers = n_monomers_min 
     while estimate_n_atoms_linear(monomers, n_monomers + 1) < n_atoms_max: # check if adding 1 more monomer keeps the length below the threshold
diff --git a/polymerist/polymers/exceptions.py b/polymerist/polymers/exceptions.py
index e7b5625..dd9e7af 100644
--- a/polymerist/polymers/exceptions.py
+++ b/polymerist/polymers/exceptions.py
@@ -4,50 +4,34 @@
 __email__ = 'timotej.bernat@colorado.edu'
 
 
-class SubstructMatchFailedError(Exception):
-    '''Raised when molecule graph isomorphism match does not form a cover'''
-    pass
-
-class InsufficientChainLengthError(Exception):
+class InsufficientChainLength(Exception):
     '''Raised when the polymer molecule being built is too short'''
     pass
 
-class ExcessiveChainLengthError(Exception):
+class ExcessiveChainLength(Exception):
     '''Raised when the polymer molecule being built is too long'''
     pass
 
+class PartialBlockSequence(Exception):
+    '''Raised when an non-whole number of copolymer blocks is needed to reach a target chain length (and is not allowed)'''
+    pass
+
 class MorphologyError(Exception):
     '''Raised when a polymer does not have the morphology (i.e. crosslinking, molecular weight, etc) an application expects'''
     pass
 
-class AlreadySolvatedError(Exception):
+class AlreadySolvated(Exception):
     '''Raised when attempting to add solvent to a molecule which already has solvent'''
     pass
 
-class ChargeMismatchError(Exception):
+class ChargeMismatch(Exception):
     '''Raised when attempting to merge two objects which disagree on their charging status'''
     pass
 
-class NoSimulationsFoundError(Exception):
-    '''Raised when attempting to load a simulation for a managed molecule when none are present'''
-    pass
-
 class MissingStructureData(Exception):
     '''Raised when a managed molecule has no associated structure file (e.g. PDB, SDF, etc.)'''
     pass
 
-class MissingForceFieldData(Exception):
-    '''Raised when a forcefield is unspecified for a Simulation or Interchange'''
-    pass
-
 class MissingMonomerData(Exception):
-    '''Raised when no monomer information is found for a Polymer'''
-    pass
-
-class MissingMonomerDataUncharged(MissingMonomerData):
-    '''Raised when no monomer information WITHOUT library charges is found for a Polymer'''
-    pass
-
-class MissingMonomerDataCharged(MissingMonomerData):
-    '''Raised when no monomer information WITH library charges is found for a Polymer'''
+    '''Raised when no monomer fragment information is found for a Polymer'''
     pass

From 7d4b5a147f8b800ea55558e604829d082c7f6ca0 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 14:12:12 -0700
Subject: [PATCH 23/78] Implemented support for fractional sequence repeats,
 with informative Exceptions for invalid inputs

---
 polymerist/polymers/building.py | 82 ++++++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index ba5042b..094e3d6 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -15,15 +15,17 @@
     from mbuild import Compound
     from mbuild.lib.recipes.polymer import Polymer as MBPolymer
 
+from fractions import Fraction
 from pathlib import Path
-from rdkit import Chem
 from collections import Counter
 
-from .exceptions import InsufficientChainLengthError, MorphologyError
+from rdkit import Chem
+
+from .exceptions import InsufficientChainLength, PartialBlockSequence, MorphologyError
 from .estimation import estimate_n_atoms_linear
 
 from ..genutils.decorators.functional import allow_string_paths
-from ..genutils.textual.substrings import unique_string
+from ..genutils.textual.substrings import unique_string, repeat_string_to_length
 
 from ..rdutils.bonding.portlib import get_linker_ids
 from ..rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports
@@ -53,7 +55,12 @@ def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tup
     return mb_compound, linker_ids
 
 @allow_string_paths
-def mbmol_to_openmm_pdb(pdb_path : Path, mbmol : Compound, num_atom_digits : int=2, resname_map : dict[str, str]=None) -> None:
+def mbmol_to_openmm_pdb(
+        pdb_path : Path,
+        mbmol : Compound, 
+        num_atom_digits : int=2,
+        resname_map : Optional[dict[str, str]]=None,
+    ) -> None:
     '''Save an MBuild Compound into an OpenMM-compatible PDB file'''
     if resname_map is None: # avoid mutable default
         resname_map = {'RES' : 'Pol'} 
@@ -116,53 +123,74 @@ def mbmol_to_rdmol(
     
     return rdmol
 
-
 # LINEAR POLYMER BUILDING
 def build_linear_polymer(
         monomers : MonomerGroup,
         n_monomers : int,
         sequence : str='A',
-        allow_partial_sequences : bool=True,
+        allow_partial_sequences : bool=False,
         add_Hs : bool=False,
         energy_minimize : bool=False,
     ) -> MBPolymer:
     '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON)
     and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
-    # 0) DETERMINE THE ORIENTATION AND NUMBER OF TERMINAL MONOMERS, SUPPLYING THIS IF AN INVALID DEFINITION IS PROVIDED
-    if monomers.has_valid_linear_term_orient: # DEV: consider moving this logic into MonomerGroup
+    # 0) DETERMINE THE ORIENTATION AND NUMBER OF TERMINAL MONOMERS, SUPPLYING THIS IF AN INVALID DEFINITION IS PROVIDED - DEV: consider moving this logic into MonomerGroup
+    if monomers.has_valid_linear_term_orient: 
         term_orient = monomers.term_orient
         LOGGER.info(f'Using pre-defined terminal group orientation {term_orient}')
     else:
         term_orient = {
             resname : orient
-                for (resname, rdmol), orient in zip(monomers.iter_rdmols(term_only=True), ['head', 'tail']) # will raise StopIteration if fewer
+                for (resname, rdmol), orient in zip(monomers.iter_rdmols(term_only=True), ['head', 'tail'])
         }
         LOGGER.warning(f'No valid terminal monomer orientations defined; autogenerated orientations "{term_orient}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
 
-    # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE)
-    n_mono_term = len(term_orient) # determine how many terminal monomers are actually present and well-defined
-    n_mono_middle = n_monomers - n_mono_term # in a linear chain, all monomers are either middle of terminal
+    # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE) - DEV: consider making a separate function
     block_size = len(sequence)
+    n_mono_term = len(term_orient)           # number of terminal monomers are actually present and well-defined
+    n_mono_middle = n_monomers - n_mono_term # number of terminal monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
+    if n_mono_middle < 0:
+        raise InsufficientChainLength(f'Registered number of terminal monomers exceeds requested chain length ({n_monomers}-mer chain can\'t possibly contain {n_mono_term} terminal monomers)')
+    
+    n_seq_whole : int     # number of full sequence repeats to reach a number of monomers less than or equal to the target
+    n_symbols_remaining : int # number of any remaining symbols in sequence (i.e. monomers) needed to close the gap to the target (allowed to be 0 if target is a multiple of the sequence length)
+    n_seq_whole, n_symbols_remaining = divmod(n_mono_middle, block_size) 
+    print(n_seq_whole, n_symbols_remaining)
+    
+    if n_symbols_remaining != 0: # a whole number of sequence repeats (including possibly 0) plus some fraction of a full block sequence
+        if not allow_partial_sequences:
+            raise PartialBlockSequence(
+                f'Partial polymer block sequence required to meet target number of monomers ("{sequence[:n_symbols_remaining]}" prefix of sequence "{sequence}"). ' \
+                'If this is acceptable, set "allow_partial_sequences=True" and try calling build routine again'
+            )    
+        sequence_selected = repeat_string_to_length(sequence, target_length=n_mono_middle, joiner='')
+        n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit)
+        LOGGER.warning(
+            f'Target number of monomers is achievable WITH a partial {n_symbols_remaining}/{block_size} sequence repeat; ' \
+            f'({n_seq_whole}*{block_size} [{sequence}] + {n_symbols_remaining} [{sequence[:n_symbols_remaining]}]) middle monomers + {n_mono_term} terminal monomers = {n_monomers} total monomers'
+        )
+    else: # for a purely-whole number of block sequence repeats
+        if n_seq_whole < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced by hand
+            raise InsufficientChainLength(
+                f'{n_monomers}-monomer chain cannot accomodate both {n_mono_term} end groups AND at least 1 middle monomer sequence'
+            )
+        sequence_selected = sequence # NOTE: rename here is for clarity, and for consistency with partial sequence case
+        n_seq_repeats = n_seq_whole
+        LOGGER.info(
+            f'Target chain length achievable with {n_seq_repeats} whole block(s) of the sequence "{sequence_selected}"; ' \
+            f'({n_seq_repeats}*{block_size} [{sequence_selected}]) middle monomers + {n_mono_term} terminal monomers = {n_monomers} total monomers'
+        )
+    print(sequence_selected, n_seq_repeats)
     
-    if (n_mono_middle % block_size) != 0:
-        raise ValueError(f'Cannot build a(n) {n_monomers}-monomer chain from any number of {block_size}-monomer blocks and {n_mono_term} end groups')
-    # NOTE: not explicitly forcing n_seq_reps to catch lingering float input / inexact division errors
-    n_seq_reps = n_mono_middle // block_size # number of times to repeat the block sequence between end groups to reach the target chain length
-    if n_seq_reps < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced by hand
-        raise InsufficientChainLengthError(f'{n_monomers}-monomer chain has few total monomers to accomodate {n_mono_term} end groups AND at least 1 middle monomer sequence')
-    # TODO: consider adding support for fractional sequence lengths IFF that fraction is a rational number whose denominator divides the sequence length...
-    # ...for example, could allow 5/2 * 'BACA' to be interpreted as 'BACA|BACA|BA'; 5/3 * 'BACA' would still be invalid though
-    LOGGER.info(f'Target chain length achievable with {n_seq_reps} block sequence repeat(s) ({n_seq_reps}*{block_size} [{sequence}] middle monomers + {n_mono_term} terminal monomers = {n_monomers} total monomers)')
-
     # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
     monomers_selected = MonomerGroup() # used to track and estimate sized of the monomers being used for building
     ## 2A) ADD MIDDLE MONOMERS TO CHAIN
     chain = MBPolymer() 
-    for (resname, middle_monomer), sequence_key in zip(
+    for (resname, middle_monomer), symbol in zip(
             monomers.iter_rdmols(term_only=False),
-            unique_string(sequence, preserve_order=True), # only register a new monomer for each appearance of a new indicator in the sequence
+            unique_string(sequence_selected, preserve_order=True), # only register a new monomer for each appearance of a new indicator in the sequence
         ): # zip with sequence limits number of middle monomers to length of block sequence
-        LOGGER.info(f'Registering middle monomer {resname} (block identifier "{sequence_key}")')
+        LOGGER.info(f'Registering middle monomer {resname} (block identifier "{symbol}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname)
         chain.add_monomer(compound=mb_monomer, indices=linker_ids)
         monomers_selected.monomers[resname] = monomers.monomers[resname]
@@ -173,7 +201,7 @@ def build_linear_polymer(
             for resname, rdmol_list in monomers.rdmols(term_only=True).items() 
     }
     for resname, head_or_tail in term_orient.items():
-        term_monomer = next(term_iters[resname])
+        term_monomer = next(term_iters[resname]) # will raise StopIteration if the terminal monomer in question is empty
         LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer, resname=resname)
         chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
@@ -185,7 +213,7 @@ def build_linear_polymer(
     
     n_atoms_est = estimate_n_atoms_linear(monomers_selected, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
     LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)')
-    chain.build(n_seq_reps, sequence=sequence, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
+    chain.build(n_seq_repeats, sequence=sequence_selected, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
     for atom in chain.particles():
         atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings)
     LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {chain.n_particles} atoms)')

From 3797c64158d776fb9a43eb4c2ce73a7ec1f38d06 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 15:09:21 -0700
Subject: [PATCH 24/78] Added new custom Exception for end-group dominated
 chains

---
 polymerist/polymers/exceptions.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/polymerist/polymers/exceptions.py b/polymerist/polymers/exceptions.py
index dd9e7af..b7502cb 100644
--- a/polymerist/polymers/exceptions.py
+++ b/polymerist/polymers/exceptions.py
@@ -12,6 +12,9 @@ class ExcessiveChainLength(Exception):
     '''Raised when the polymer molecule being built is too long'''
     pass
 
+class EndGroupDominatedChain(Exception):
+    '''Raised to indicate there are more end groups present in a chain than the total number of monomers allowed'''
+
 class PartialBlockSequence(Exception):
     '''Raised when an non-whole number of copolymer blocks is needed to reach a target chain length (and is not allowed)'''
     pass

From 8c96e48128a0399d2bc0f8dd5169afb21cef8d32 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 15:43:48 -0700
Subject: [PATCH 25/78] Separated procrustean sequence determination into
 dedicated helper function

---
 polymerist/polymers/building.py | 135 ++++++++++++++++++++++----------
 1 file changed, 95 insertions(+), 40 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 094e3d6..4ef796c 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -21,7 +21,7 @@
 
 from rdkit import Chem
 
-from .exceptions import InsufficientChainLength, PartialBlockSequence, MorphologyError
+from .exceptions import EndGroupDominatedChain, InsufficientChainLength, PartialBlockSequence, MorphologyError
 from .estimation import estimate_n_atoms_linear
 
 from ..genutils.decorators.functional import allow_string_paths
@@ -124,35 +124,62 @@ def mbmol_to_rdmol(
     return rdmol
 
 # LINEAR POLYMER BUILDING
-def build_linear_polymer(
-        monomers : MonomerGroup,
-        n_monomers : int,
-        sequence : str='A',
-        allow_partial_sequences : bool=False,
-        add_Hs : bool=False,
-        energy_minimize : bool=False,
-    ) -> MBPolymer:
-    '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON)
-    and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
-    # 0) DETERMINE THE ORIENTATION AND NUMBER OF TERMINAL MONOMERS, SUPPLYING THIS IF AN INVALID DEFINITION IS PROVIDED - DEV: consider moving this logic into MonomerGroup
-    if monomers.has_valid_linear_term_orient: 
-        term_orient = monomers.term_orient
-        LOGGER.info(f'Using pre-defined terminal group orientation {term_orient}')
-    else:
-        term_orient = {
-            resname : orient
-                for (resname, rdmol), orient in zip(monomers.iter_rdmols(term_only=True), ['head', 'tail'])
-        }
-        LOGGER.warning(f'No valid terminal monomer orientations defined; autogenerated orientations "{term_orient}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
+def procrustean_polymer_sequence_alignment(
+        sequence : str,
+        n_monomers_target : int,
+        n_monomers_terminal : int,
+        allow_partial_sequences : bool=False
+    ) -> tuple[str, int]:
+    '''
+    For a given polymer block sequence "S", target linear chain length, and number of terminal monomers,
+    Returns a sequence "P" and number of repeats "r" which, taken together, satisfy the following:
+    - The number of monomers in r repeats of P plus the number of terminal monomers is precisely equal to the target number of monomers
+    - The symbols in sequence P cycle through the symbols in S, in the order they appear in S
+    - The number of times S is cycled through in P is always a rational multiple of the length of S
+    If no satisfiable sequence-count pair can be found, raises an appropriate informative exception
+    
+    Named to reflect the fact that the original sequence S will be stretched or truncated to fit the given target sequence length
+    
+    Parameters
+    ----------
+    sequence : str
+        A sequence indicating a periodic ordering of monomers in a linear polymer block (e.g. "A", "ABAC", etc)
+        Each unique symbol in the sequence corresponds to a distinct monomer in the block
+    n_monomers_target : int
+        The desired number of monomers (including terminal monomers) in a polymer chain
+    n_monomers_terminal : int
+        The number of terminal monomers ("end groups") which are to be included in the chain
+        in addition to the middle monomers described by "sequence"
+    allow_partial_sequences : bool, default False
+        Whether to allow fractional repeats of the original sequence in order to meet the target number of monomers
+        
+        For example, to construct a 12-mer chain with 2 end groups from the sequence "BACA", one would require 10 middle monomers
+        which can only be achieved with 2.5 (10/4) sequence repeats, namely as "BACA|BACA|BA"; 
 
-    # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE) - DEV: consider making a separate function
+        This behavior may or may not be desired, depending on the use case, and can be controlled by this flag
+    
+    Returns
+    -------
+    sequence_procrustean : str
+        A possibly modified version of the original polymer block sequence
+    n_seq_repeats : int
+        The number of times "sequence_procrustean" must be repeated to achieve the target sequence length
+    
+    Raises
+    ------
+    EndGroupDominatedChain
+        The number of terminal monomers exceeds the number of total monomers
+    PartialBlockSequence
+        If a partial sequence repeat is required but disallowed (by setting allow_partial_sequences=False)
+    InsufficientChainLength
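+        If the target chain length leaves room for no middle monomers at all once the terminal monomers are counted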
+    '''
     block_size = len(sequence)
-    n_mono_term = len(term_orient)           # number of terminal monomers are actually present and well-defined
-    n_mono_middle = n_monomers - n_mono_term # number of terminal monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
+    n_mono_middle = n_monomers_target - n_monomers_terminal # number of middle monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
     if n_mono_middle < 0:
-        raise InsufficientChainLength(f'Registered number of terminal monomers exceeds requested chain length ({n_monomers}-mer chain can\'t possibly contain {n_mono_term} terminal monomers)')
+        raise EndGroupDominatedChain(f'Registered number of terminal monomers exceeds requested chain length ({n_monomers_target}-mer chain can\'t possibly contain {n_monomers_terminal} terminal monomers)')
     
-    n_seq_whole : int     # number of full sequence repeats to reach a number of monomers less than or equal to the target
+    n_seq_whole : int         # number of full sequence repeats to reach a number of monomers less than or equal to the target
     n_symbols_remaining : int # number of any remaining symbols in sequence (i.e. monomers) needed to close the gap to the target (allowed to be 0 if target is a multiple of the sequence length)
     n_seq_whole, n_symbols_remaining = divmod(n_mono_middle, block_size) 
     print(n_seq_whole, n_symbols_remaining)
@@ -163,33 +190,61 @@ def build_linear_polymer(
                 f'Partial polymer block sequence required to meet target number of monomers ("{sequence[:n_symbols_remaining]}" prefix of sequence "{sequence}"). ' \
                 'If this is acceptable, set "allow_partial_sequences=True" and try calling build routine again'
             )    
-        sequence_selected = repeat_string_to_length(sequence, target_length=n_mono_middle, joiner='')
+        sequence_procrustean = repeat_string_to_length(sequence, target_length=n_mono_middle, joiner='')
         n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit)
         LOGGER.warning(
             f'Target number of monomers is achievable WITH a partial {n_symbols_remaining}/{block_size} sequence repeat; ' \
-            f'({n_seq_whole}*{block_size} [{sequence}] + {n_symbols_remaining} [{sequence[:n_symbols_remaining]}]) middle monomers + {n_mono_term} terminal monomers = {n_monomers} total monomers'
+            f'({n_seq_whole}*{block_size} [{sequence}] + {n_symbols_remaining} [{sequence[:n_symbols_remaining]}]) middle monomers + {n_monomers_terminal} terminal monomers = {n_monomers} total monomers'
         )
     else: # for a purely-whole number of block sequence repeats
-        if n_seq_whole < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced by hand
+        if n_seq_whole < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced my hand
             raise InsufficientChainLength(
-                f'{n_monomers}-monomer chain cannot accomodate both {n_mono_term} end groups AND at least 1 middle monomer sequence'
+                f'{n_monomers_target}-monomer chain cannot accomodate both {n_monomers_terminal} end groups AND at least 1 middle monomer sequence'
             )
-        sequence_selected = sequence # NOTE: rename here is for clarity, and for consistency with partial sequence case
+        sequence_procrustean = sequence # NOTE: rename here is for clarity, and for consistency with partial sequence case
         n_seq_repeats = n_seq_whole
         LOGGER.info(
-            f'Target chain length achievable with {n_seq_repeats} whole block(s) of the sequence "{sequence_selected}"; ' \
-            f'({n_seq_repeats}*{block_size} [{sequence_selected}]) middle monomers + {n_mono_term} terminal monomers = {n_monomers} total monomers'
+            f'Target chain length achievable with {n_seq_repeats} whole block(s) of the sequence "{sequence_procrustean}"; ' \
+            f'({n_seq_repeats}*{block_size} [{sequence_procrustean}]) middle monomers + {n_monomers_terminal} terminal monomers = {n_monomers_target} total monomers'
         )
-    print(sequence_selected, n_seq_repeats)
+    return sequence_procrustean, n_seq_repeats
+
+
+def build_linear_polymer(
+        monomers : MonomerGroup,
+        n_monomers : int,
+        sequence : str='A',
+        allow_partial_sequences : bool=False,
+        add_Hs : bool=False,
+        energy_minimize : bool=False,
+    ) -> MBPolymer:
+    '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON)
+    and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
+    # 0) DETERMINE THE ORIENTATION AND NUMBER OF TERMINAL MONOMERS, SUPPLYING THIS IF AN INVALID DEFINITION IS PROVIDED - DEV: consider moving this logic into MonomerGroup
+    if monomers.has_valid_linear_term_orient: 
+        term_orient = monomers.term_orient
+        LOGGER.info(f'Using pre-defined terminal group orientation {term_orient}')
+    else:
+        term_orient = {
+            orient : resname
+                for (resname, rdmol), orient in zip(monomers.iter_rdmols(term_only=True), ['head', 'tail'])
+        }
+        LOGGER.warning(f'No valid terminal monomer orientations defined; autogenerated orientations "{term_orient}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
+
+    # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE) - DEV: consider making a separate function
+    sequence_compliant, n_seq_repeats = procrustean_polymer_sequence_alignment(
+        sequence,
+        n_monomers_target=n_monomers,
+        n_monomers_terminal=len(term_orient), # number of terminal monomers are actually present and well-defined
+        allow_partial_sequences=allow_partial_sequences,
+    )
+    sequence_unique = unique_string(sequence_compliant, preserve_order=True) # only register a new monomer for each appearance of a new, unique symbol in the sequence
     
     # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
     monomers_selected = MonomerGroup() # used to track and estimate sized of the monomers being used for building
     ## 2A) ADD MIDDLE MONOMERS TO CHAIN
     chain = MBPolymer() 
-    for (resname, middle_monomer), symbol in zip(
-            monomers.iter_rdmols(term_only=False),
-            unique_string(sequence_selected, preserve_order=True), # only register a new monomer for each appearance of a new indicator in the sequence
-        ): # zip with sequence limits number of middle monomers to length of block sequence
+    for (resname, middle_monomer), symbol in zip(monomers.iter_rdmols(term_only=False), sequence_unique): # zip with sequence limits number of middle monomers to length of block sequence
         LOGGER.info(f'Registering middle monomer {resname} (block identifier "{symbol}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname)
         chain.add_monomer(compound=mb_monomer, indices=linker_ids)
@@ -200,7 +255,7 @@ def build_linear_polymer(
         resname : iter(rdmol_list)   # made necessary by annoying list-bound structure of current substructure spec
             for resname, rdmol_list in monomers.rdmols(term_only=True).items() 
     }
-    for resname, head_or_tail in term_orient.items():
+    for head_or_tail, resname in term_orient.items():
         term_monomer = next(term_iters[resname]) # will raise StopIteration if the terminal monomer in question is empty
         LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer, resname=resname)
@@ -213,7 +268,7 @@ def build_linear_polymer(
     
     n_atoms_est = estimate_n_atoms_linear(monomers_selected, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
     LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)')
-    chain.build(n_seq_repeats, sequence=sequence_selected, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
+    chain.build(n_seq_repeats, sequence=sequence_compliant, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
     for atom in chain.particles():
         atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings)
     LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {chain.n_particles} atoms)')

From f7422bb15054918ecaf1c90bd6a5f5d8267e3a27 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 15:44:40 -0700
Subject: [PATCH 26/78] Switched order of residue name and head/tail identifier
 in MonomerGroup.term_orient (head/tail is now key, and residue name is value)

---
 polymerist/polymers/monomers/repr.py   | 8 ++++----
 polymerist/tests/data/peg-pla-pga.json | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index f8ebf62..a3c50ba 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -23,7 +23,7 @@
 class MonomerGroup:
     '''Stores collections of residue-labelled monomer SMARTS'''
     monomers : ResidueSmarts = field(default_factory=dict)
-    term_orient : dict[str, str] = field(default_factory=dict)
+    term_orient : dict[str, str] = field(default_factory=dict) # keys are either "head" or "tail", values are the names of residues in "monomers"
 
     @staticmethod
     def is_terminal(monomer : Mol) -> bool:
@@ -87,9 +87,9 @@ def _is_valid(self) -> bool:
     def has_valid_linear_term_orient(self) -> bool:
         '''Check whether terminal group orientations are sufficient to define a linear polymer'''
         return (
-            bool(self.term_orient)                                       # check that: 1) the term group orientations are non-empty / non-null...
-            and all(resname in self.monomers for resname in self.term_orient.keys()) # 2) all term group keys match a present monomer...
-            and sorted(self.term_orient.values()) == ['head', 'tail']                # 3) orientation labels are only "head" and "tail" (in either order)
+            bool(self.term_orient)                                         # check that: 1) term group orientations are non-empty...
+            and set(self.term_orient.keys()) == {'head', 'tail'}                       # 2) ...orientation labels are only "head" and "tail" (in any order)...
+            and all(resname in self.monomers for resname in self.term_orient.values()) # 3) ... and all term group keys match a present monomer
         )
     
     # COMPOSITION AND I/O METHODS
diff --git a/polymerist/tests/data/peg-pla-pga.json b/polymerist/tests/data/peg-pla-pga.json
index 2db06af..a865ed5 100644
--- a/polymerist/tests/data/peg-pla-pga.json
+++ b/polymerist/tests/data/peg-pla-pga.json
@@ -31,8 +31,8 @@
             ]
         },
         "term_orient": {
-            "PEG-1A": "head",
-            "PEG_1B": "tail"
+            "head": "PEG-1A",
+            "tail": "PEG_1B"
         }
     }
 }
\ No newline at end of file

From e2be34df7d04bdf4fc254dba01617e917354aab1 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 16:33:40 -0700
Subject: [PATCH 27/78] Added __post_init__ check for listification of bare
 SMARTS and for SMARTS string validity

---
 polymerist/polymers/monomers/repr.py | 55 ++++++++++++++++++----------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index a3c50ba..b74944c 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -12,19 +12,36 @@
 
 from ...genutils.iteration import iter_len
 from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable
-from ...rdutils.bonding.portlib import get_num_ports
 
+from ...smileslib.primitives import Smarts, is_valid_SMARTS
+from ...rdutils.bonding.portlib import get_num_ports
 
-ResidueSmarts : TypeAlias = dict[str, list[str]] # monomer SMARTS strings keyed by residue name
 
 # MAIN REPRESENTATION CLASS
 @make_jsonifiable
 @dataclass
 class MonomerGroup:
     '''Stores collections of residue-labelled monomer SMARTS'''
-    monomers : ResidueSmarts = field(default_factory=dict)
+    monomers : dict[str, Union[Smarts, list[Smarts]]] = field(default_factory=dict)
     term_orient : dict[str, str] = field(default_factory=dict) # keys are either "head" or "tail", values are the names of residues in "monomers"
 
+    def __post_init__(self) -> None:
+        # Encase bare SMARTS into lists and check that all monomer SMARTS are valid
+        for resname, smarts_seq in self.monomers.items():
+            if isinstance(smarts_seq, list):
+                smarts_list = smarts_seq # no modification needed
+            elif isinstance(smarts_seq, str):
+                smarts_list = [smarts_seq] # wrap lone SMARTS string in list
+                self.monomers[resname] = smarts_list # update value internally (doesn't change size of dict)
+            else:
+                raise TypeError(f'Values of monomers must be either SMARTS strings or lists of SMARTS strings, not "{type(smarts_seq).__name__}"')
+            
+            # check that all SMARTS are valid
+            for i, smarts in enumerate(smarts_list): # we can now be sure that this is a list of SMARTS strings
+                if not is_valid_SMARTS(smarts):
+                    raise ValueError(f'Provided invalid monomer SMARTS string for {resname}[{i}]: "{smarts}"')               
+        # DEV: opted to forgo term_orient check for now, as modifying this violates the read-only data model aimed for here
+                
     @staticmethod
     def is_terminal(monomer : Mol) -> bool:
         '''Determine whether or not a monomer is terminal'''
@@ -32,7 +49,7 @@ def is_terminal(monomer : Mol) -> bool:
 
     # ATTRIBUTE PROPERTIES AND ALIASES
     @property
-    def SMARTS(self) -> ResidueSmarts:
+    def SMARTS(self) -> dict[str, list[Smarts]]:
         '''Alias of legacy "monomers" attribute'''
         return self.monomers # alias of legacy name for convenience
     
@@ -69,42 +86,40 @@ def rdmols(self, term_only : Optional[bool]=None) -> dict[str, list[Mol]]:
     
     @property
     def n_monomers(self) -> int:
-        '''Returns number of present monomers
-        Multiple monomers with the same residue name are considered distinct'''
+        '''Returns number of present monomers; multiple monomers under the same residue name are considered distinct'''
         return iter_len(self.iter_rdmols(term_only=None))
     
-    # VALIDATION AND PROPERTY CHECKS
-    @property
-    def _is_valid(self) -> bool:
-        '''Check that types and formatting are correct'''
-        for resname, SMARTS_list in self.monomers.items():
-            if not (isinstance(resname, str) and isinstance(SMARTS_list, list)):
-                return False
-        else:
-            return True # valid only if none of the SMARTS lists fail
-        
+    # END GROUP DETERMINATION      
     @property
-    def has_valid_linear_term_orient(self) -> bool:
+    def _has_valid_linear_term_orient(self) -> bool:
         '''Check whether terminal group orientations are sufficient to define a linear polymer'''
         return (
             bool(self.term_orient)                                         # check that: 1) term group orientations are non-empty...
             and set(self.term_orient.keys()) == {'head', 'tail'}                       # 2) ...orientation labels are only "head" and "tail" (in any order)...
             and all(resname in self.monomers for resname in self.term_orient.values()) # 3) ... and all term group keys match a present monomer
         )
+        
+    @property
+    def linear_end_groups(self) -> dict[str, Mol]:
+        '''
+        Returns head-and-tail end groups as defined by term_orient
+        If term orient is undefined, will 
+        '''
+        ...
     
-    # COMPOSITION AND I/O METHODS
+    # COMPOSITION METHODS
     def __add__(self, other : 'MonomerGroup') -> 'MonomerGroup':
         '''Content-aware method of merging multiple sets of monomer info via the addition operator'''
         cls = self.__class__
         if not isinstance(other, cls):
             raise NotImplementedError(f'Can only merge {cls.__name__} with another {cls.__name__}, not object of type {type(other)}')
-
+        # TODO: figure out how to handle combination of term group orientation gracefully (ignoring for now)
         return MonomerGroup(monomers={**self.monomers, **other.monomers})
 
     __radd__ = __add__ # support reverse addition
 
     # CHEMICAL INFORMATION
-    def unique(self, cap_group : Union[str, Mol]=Chem.MolFromSmarts('[H]-[*]')) -> 'MonomerGroup':
+    def unique(self, cap_group : Union[Smarts, Mol]=Chem.MolFromSmarts('[H]-[*]')) -> 'MonomerGroup':
         '''Return a MonomerGroup containing only the unique monomers present, given a particular port saturating group (by default just a hydrogen)'''
         raise NotImplementedError
         # unique_mono = set()

From 56b4144f39ab415e7014404d970f996896c4a4e1 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 16:37:26 -0700
Subject: [PATCH 28/78] Added module-level logger

---
 polymerist/polymers/monomers/repr.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index b74944c..8ffde83 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -3,6 +3,9 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
+import logging
+LOGGER = logging.getLogger(__name__)
+
 from typing import Generator, Optional, TypeAlias, Union
 from dataclasses import dataclass, field
 
@@ -31,6 +34,7 @@ def __post_init__(self) -> None:
             if isinstance(smarts_seq, list):
                 smarts_list = smarts_seq # no modification needed
             elif isinstance(smarts_seq, str):
+                LOGGER.warning(f'Wrapping bare monomer SMARTS in list to comply with spec (storing as ["{smarts_seq}"])')
                 smarts_list = [smarts_seq] # wrap lone SMARTS string in list
                 self.monomers[resname] = smarts_list # update value internally (doesn't change size of dict)
             else:

From d852ed34123214a892fde69131b59e05a8e3d95b Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 16:54:13 -0700
Subject: [PATCH 29/78] Added internal method for producing end groups for
 linear polymer building (with dynamic fallback)

---
 polymerist/polymers/monomers/repr.py | 42 ++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index 8ffde83..e36ec5d 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -6,12 +6,13 @@
 import logging
 LOGGER = logging.getLogger(__name__)
 
-from typing import Generator, Optional, TypeAlias, Union
+from typing import Generator, Optional, Union
 from dataclasses import dataclass, field
 
+from itertools import cycle
 from collections import defaultdict
+
 from rdkit import Chem
-from rdkit.Chem.rdchem import Mol
 
 from ...genutils.iteration import iter_len
 from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable
@@ -32,6 +33,8 @@ def __post_init__(self) -> None:
         # Encase bare SMARTS into lists and check that all monomer SMARTS are valid
         for resname, smarts_seq in self.monomers.items():
             if isinstance(smarts_seq, list):
+                if not smarts_seq:
+                    raise IndexError(f'Empty monomer declaration for "{resname}"') # catch case where empty list if provided (would slip through subsequent checks otherwise)
                 smarts_list = smarts_seq # no modification needed
             elif isinstance(smarts_seq, str):
                 LOGGER.warning(f'Wrapping bare monomer SMARTS in list to comply with spec (storing as ["{smarts_seq}"])')
@@ -47,7 +50,7 @@ def __post_init__(self) -> None:
         # DEV: opted to forgo term_orient check for now, as modifying this violates the read-only data model aimed for here
                 
     @staticmethod
-    def is_terminal(monomer : Mol) -> bool:
+    def is_terminal(monomer : Chem.Mol) -> bool:
         '''Determine whether or not a monomer is terminal'''
         return get_num_ports(monomer) == 1
 
@@ -57,7 +60,7 @@ def SMARTS(self) -> dict[str, list[Smarts]]:
         '''Alias of legacy "monomers" attribute'''
         return self.monomers # alias of legacy name for convenience
     
-    def iter_rdmols(self, term_only : Optional[bool]=None) -> Generator[tuple[str, Mol], None, None]:
+    def iter_rdmols(self, term_only : Optional[bool]=None) -> Generator[tuple[str, Chem.Mol], None, None]:
         '''
         Generate (residue name, RDKit Mol) pairs of all monomers present
         Simplifies iteration over internal lists of monomer Mols
@@ -73,7 +76,7 @@ def iter_rdmols(self, term_only : Optional[bool]=None) -> Generator[tuple[str, M
                 if (term_only is None) or (MonomerGroup.is_terminal(monomer) == term_only):
                     yield (resname, monomer)
 
-    def rdmols(self, term_only : Optional[bool]=None) -> dict[str, list[Mol]]:
+    def rdmols(self, term_only : Optional[bool]=None) -> dict[str, list[Chem.Mol]]:
         '''
         Returns dict of RDKit Mol lists keyed by residue name
 
@@ -104,12 +107,33 @@ def _has_valid_linear_term_orient(self) -> bool:
         )
         
     @property
-    def linear_end_groups(self) -> dict[str, Mol]:
+    def linear_end_groups(self) -> dict[str, Chem.Mol]:
         '''
         Returns head-and-tail end groups as defined by term_orient
-        If term orient is undefined, will 
+        
+        If term orient is undefined, will automatically take then first 
+        <= 2 terminal groups available to be the end groups
         '''
-        ...
+        if self._has_valid_linear_term_orient: 
+            LOGGER.info(f'Using user-defined terminal group orientation {self.term_orient}')
+            monomer_iters = {
+                resname : cycle(smarts_list) 
+                    for resname, smarts_list in self.rdmols(term_only=True).items()
+            } # cycle handles degenerate end group case correctly
+            
+            return {
+                head_or_tail : next(monomer_iters[resname])
+                    for head_or_tail, resname in self.term_orient.items()
+            }
+        else:
+            term_orient_auto : dict[str, Smarts] = {}
+            end_groups_auto  : dict[str, Chem.Mol] = {}
+            for head_or_tail, (resname, rdmol) in zip(['head', 'tail'], self.iter_rdmols(term_only=True)): # zip will bottom out early if fewer than 2 terminal monomers are present
+                term_orient_auto[head_or_tail] = resname
+                end_groups_auto[head_or_tail]  = rdmol
+            LOGGER.warning(f'No valid terminal monomer orientations defined; auto-assigned orientations "{term_orient_auto}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
+                
+            return end_groups_auto
     
     # COMPOSITION METHODS
     def __add__(self, other : 'MonomerGroup') -> 'MonomerGroup':
@@ -123,7 +147,7 @@ def __add__(self, other : 'MonomerGroup') -> 'MonomerGroup':
     __radd__ = __add__ # support reverse addition
 
     # CHEMICAL INFORMATION
-    def unique(self, cap_group : Union[Smarts, Mol]=Chem.MolFromSmarts('[H]-[*]')) -> 'MonomerGroup':
+    def unique(self, cap_group : Union[Smarts, Chem.Mol]=Chem.MolFromSmarts('[H]-[*]')) -> 'MonomerGroup':
         '''Return a MonomerGroup containing only the unique monomers present, given a particular port saturating group (by default just a hydrogen)'''
         raise NotImplementedError
         # unique_mono = set()

From 9903dd1b689f6849f6d72a9e8369202b034ca077 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 16:58:00 -0700
Subject: [PATCH 30/78] Changed MonomerGroup.linear_end_groups from property to
 vanilla method to emphasize that calculation being done is non-trivial

---
 polymerist/polymers/monomers/repr.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index e36ec5d..fa98e76 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -106,7 +106,6 @@ def _has_valid_linear_term_orient(self) -> bool:
             and all(resname in self.monomers for resname in self.term_orient.values()) # 3) ... and all term group keys match a present monomer
         )
         
-    @property
     def linear_end_groups(self) -> dict[str, Chem.Mol]:
         '''
         Returns head-and-tail end groups as defined by term_orient

From 70ee78797955cad6e1ace1c8d06030e031d0b0f4 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 17:16:35 -0700
Subject: [PATCH 31/78] Deprecated _has_valid_linear_term_orient, included
 residue name in linear_end_groups() output

---
 polymerist/polymers/monomers/repr.py | 29 +++++++++++++---------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index fa98e76..174bd80 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -96,24 +96,21 @@ def n_monomers(self) -> int:
         '''Returns number of present monomers; multiple monomers under the same residue name are considered distinct'''
         return iter_len(self.iter_rdmols(term_only=None))
     
-    # END GROUP DETERMINATION      
-    @property
-    def _has_valid_linear_term_orient(self) -> bool:
-        '''Check whether terminal group orientations are sufficient to define a linear polymer'''
-        return (
-            bool(self.term_orient)                                         # check that: 1) term group orientations are non-empty...
-            and set(self.term_orient.keys()) == {'head', 'tail'}                       # 2) ...orientation labels are only "head" and "tail" (in any order)...
-            and all(resname in self.monomers for resname in self.term_orient.values()) # 3) ... and all term group keys match a present monomer
-        )
-        
-    def linear_end_groups(self) -> dict[str, Chem.Mol]:
+    # END GROUP DETERMINATION 
+    def linear_end_groups(self) -> dict[str, tuple[str, Chem.Mol]]:
         '''
-        Returns head-and-tail end groups as defined by term_orient
+        Returns head-and-tail end group residue names and Mol objects as defined by term_orient
         
         If term orient is undefined, will automatically take then first 
         <= 2 terminal groups available to be the end groups
+        
+        Returns
+        -------
+        end_groups : dict[str, tuple[str, Chem.Mol]]
+            A dict whose keys are any of {'head', 'tail'} and whose
+            values are 2-tuples of residue names and Mols for the corresponding monomer
         '''
-        if self._has_valid_linear_term_orient: 
+        if self.term_orient and set(self.term_orient.keys()) == {'head', 'tail'}:
             LOGGER.info(f'Using user-defined terminal group orientation {self.term_orient}')
             monomer_iters = {
                 resname : cycle(smarts_list) 
@@ -121,15 +118,15 @@ def linear_end_groups(self) -> dict[str, Chem.Mol]:
             } # cycle handles degenerate end group case correctly
             
             return {
-                head_or_tail : next(monomer_iters[resname])
+                head_or_tail : (resname, next(monomer_iters[resname])) # will raise KeyError if any of the resnames are not present
                     for head_or_tail, resname in self.term_orient.items()
             }
         else:
             term_orient_auto : dict[str, Smarts] = {}
             end_groups_auto  : dict[str, Chem.Mol] = {}
             for head_or_tail, (resname, rdmol) in zip(['head', 'tail'], self.iter_rdmols(term_only=True)): # zip will bottom out early if fewer than 2 terminal monomers are present
-                term_orient_auto[head_or_tail] = resname
-                end_groups_auto[head_or_tail]  = rdmol
+                term_orient_auto[head_or_tail] = resname # populate purely for logging
+                end_groups_auto[head_or_tail]  = (resname, rdmol)
             LOGGER.warning(f'No valid terminal monomer orientations defined; auto-assigned orientations "{term_orient_auto}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
                 
             return end_groups_auto

From df12d48774be007b781c3e5a7de9adcfc68d84e5 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 17:49:56 -0700
Subject: [PATCH 32/78] Deferred end group determination to internal
 implementation in MonomerGroup

---
 polymerist/polymers/building.py | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 4ef796c..279c427 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -172,7 +172,7 @@ def procrustean_polymer_sequence_alignment(
     PartialBlockSequence
         If a partial sequence repeat is required but disallowed (by setting allow_partial_sequences=False)
     InsufficientChainLength
-        If 
+        If the target number of monomers results in no middle monomers being included (i.e. neither full NOR partial sequence repeats)
     '''
     block_size = len(sequence)
     n_mono_middle = n_monomers_target - n_monomers_terminal # number of terminal monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
@@ -209,7 +209,6 @@ def procrustean_polymer_sequence_alignment(
         )
     return sequence_procrustean, n_seq_repeats
 
-
 def build_linear_polymer(
         monomers : MonomerGroup,
         n_monomers : int,
@@ -220,30 +219,21 @@ def build_linear_polymer(
     ) -> MBPolymer:
     '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON)
     and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
-    # 0) DETERMINE THE ORIENTATION AND NUMBER OF TERMINAL MONOMERS, SUPPLYING THIS IF AN INVALID DEFINITION IS PROVIDED - DEV: consider moving this logic into MonomerGroup
-    if monomers.has_valid_linear_term_orient: 
-        term_orient = monomers.term_orient
-        LOGGER.info(f'Using pre-defined terminal group orientation {term_orient}')
-    else:
-        term_orient = {
-            orient : resname
-                for (resname, rdmol), orient in zip(monomers.iter_rdmols(term_only=True), ['head', 'tail'])
-        }
-        LOGGER.warning(f'No valid terminal monomer orientations defined; autogenerated orientations "{term_orient}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
-
     # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE) - DEV: consider making a separate function
+    end_groups = monomers.linear_end_groups() # cache end groups so they dont need to be recalculated when registering end groups
     sequence_compliant, n_seq_repeats = procrustean_polymer_sequence_alignment(
         sequence,
         n_monomers_target=n_monomers,
-        n_monomers_terminal=len(term_orient), # number of terminal monomers are actually present and well-defined
+        n_monomers_terminal=len(end_groups), # number of terminal monomers are actually present and well-defined
         allow_partial_sequences=allow_partial_sequences,
     )
     sequence_unique = unique_string(sequence_compliant, preserve_order=True) # only register a new monomer for each appearance of a new, unique symbol in the sequence
     
     # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
+    chain = MBPolymer() 
     monomers_selected = MonomerGroup() # used to track and estimate sized of the monomers being used for building
+    
     ## 2A) ADD MIDDLE MONOMERS TO CHAIN
-    chain = MBPolymer() 
     for (resname, middle_monomer), symbol in zip(monomers.iter_rdmols(term_only=False), sequence_unique): # zip with sequence limits number of middle monomers to length of block sequence
         LOGGER.info(f'Registering middle monomer {resname} (block identifier "{symbol}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname)
@@ -251,12 +241,7 @@ def build_linear_polymer(
         monomers_selected.monomers[resname] = monomers.monomers[resname]
 
     ## 2B) ADD TERMINAL MONOMERS TO CHAIN
-    term_iters = { # need to convert to iterators to allow for generator-like advancement (required for term group selection to behave as expected)
-        resname : iter(rdmol_list)   # made necessary by annoying list-bound structure of current substructure spec
-            for resname, rdmol_list in monomers.rdmols(term_only=True).items() 
-    }
-    for head_or_tail, resname in term_orient.items():
-        term_monomer = next(term_iters[resname]) # will raise StopIteration if the terminal monomer in question is empty
+    for head_or_tail, (resname, term_monomer) in end_groups.items():
         LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer, resname=resname)
         chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
@@ -268,11 +253,13 @@ def build_linear_polymer(
     
     n_atoms_est = estimate_n_atoms_linear(monomers_selected, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
     LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)')
+    
     chain.build(n_seq_repeats, sequence=sequence_compliant, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
     for atom in chain.particles():
         atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings)
     LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {chain.n_particles} atoms)')
     
+    # 4) OPTIONALLY, PERFORM FINAL UFF ENERGY MINIMIZATION
     if energy_minimize:
         LOGGER.info('Energy-minimizing chain to find more stable conformer')
         chain.energy_minimize()

From f1e6925d52f298b1e0c1d59e10d02ba45362d274 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 20:33:24 -0700
Subject: [PATCH 33/78] Enhanced logging of sequence breakdown, unified logging
 between whole and partial cases

---
 polymerist/polymers/building.py | 39 ++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
index 279c427..6d0cc94 100644
--- a/polymerist/polymers/building.py
+++ b/polymerist/polymers/building.py
@@ -174,6 +174,7 @@ def procrustean_polymer_sequence_alignment(
     InsufficientChainLength
         If the target number of monomers results in no middle monomers being included (i.e. neither full NOR partial sequence repeats)
     '''
+    # Evaluate sizes of missing components from given values
     block_size = len(sequence)
     n_mono_middle = n_monomers_target - n_monomers_terminal # number of terminal monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
     if n_mono_middle < 0:
@@ -182,8 +183,8 @@ def procrustean_polymer_sequence_alignment(
     n_seq_whole : int         # number of full sequence repeats to reach a number of monomers less than or equal to the target
     n_symbols_remaining : int # number of any remaining symbols in sequence (i.e. monomers) needed to close the gap to the target (allowed to be 0 if target is a multiple of the sequence length)
     n_seq_whole, n_symbols_remaining = divmod(n_mono_middle, block_size) 
-    print(n_seq_whole, n_symbols_remaining)
-    
+
+    # Break down into cases by whether or not a whole number of sequence repeats is possible
     if n_symbols_remaining != 0: # a whole number of sequence repeats (including possibly 0) plus some fraction of a full block sequence
         if not allow_partial_sequences:
             raise PartialBlockSequence(
@@ -192,10 +193,6 @@ def procrustean_polymer_sequence_alignment(
             )    
         sequence_procrustean = repeat_string_to_length(sequence, target_length=n_mono_middle, joiner='')
         n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit)
-        LOGGER.warning(
-            f'Target number of monomers is achievable WITH a partial {n_symbols_remaining}/{block_size} sequence repeat; ' \
-            f'({n_seq_whole}*{block_size} [{sequence}] + {n_symbols_remaining} [{sequence[:n_symbols_remaining]}]) middle monomers + {n_monomers_terminal} terminal monomers = {n_monomers} total monomers'
-        )
     else: # for a purely-whole number of block sequence repeats
         if n_seq_whole < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced my hand
             raise InsufficientChainLength(
@@ -203,10 +200,32 @@ def procrustean_polymer_sequence_alignment(
             )
         sequence_procrustean = sequence # NOTE: rename here is for clarity, and for consistency with partial sequence case
         n_seq_repeats = n_seq_whole
-        LOGGER.info(
-            f'Target chain length achievable with {n_seq_repeats} whole block(s) of the sequence "{sequence_procrustean}"; ' \
-            f'({n_seq_repeats}*{block_size} [{sequence_procrustean}]) middle monomers + {n_monomers_terminal} terminal monomers = {n_monomers_target} total monomers'
-        )
+        
+    # Generate descriptive log message to summarize sequence modifications
+    ## Determine info present for whole and partial sections
+    desc_seq_counts_parts = []
+    desc_seq_order_middle = []
+    
+    if n_seq_whole != 0: ## Whole sequence strings
+        desc_seq_counts_parts.append(f'{n_seq_whole} whole {block_size}-sequence repeats')
+        desc_seq_order_middle.append(f'{n_seq_whole}*[{sequence}]')
+        
+    if n_symbols_remaining != 0: ## Partial sequence strings
+        desc_seq_counts_parts.append(f'a partial {n_symbols_remaining}/{block_size} sequence repeat')
+        desc_seq_order_middle.append(f'[{sequence[:n_symbols_remaining]}]')
+        
+    ## Finalizing sequence counts descriptor parts
+    tally_str = f'({n_seq_whole}*{block_size} + {n_symbols_remaining}) middle monomers + {n_monomers_terminal} terminal monomers = {n_monomers_target} total monomers)'
+    if len(desc_seq_counts_parts) == 2:
+        desc_seq_counts_parts.insert(1, ' and ') # include conjunction if a mixed (i.e. both whole and fractional) solution was found
+    
+    ## Finalizing sequence order descriptor parts
+    desc_seq_order_parts = ['[END-GROUP]']*n_monomers_terminal # abut with correct amount of end group indicators
+    desc_seq_order_parts[1:-1] = desc_seq_order_middle # insert middle sections for whole and partial sequences
+    
+    ## putting everything together
+    LOGGER.info(f'Target chain length achievable with {"".join(desc_seq_counts_parts)};\n Namely, polymer will be sequenced as {" + ".join(desc_seq_order_parts)}, yielding {tally_str}')
+        
     return sequence_procrustean, n_seq_repeats
 
 def build_linear_polymer(

From dc053afab32b03864c8a79e8e34beeec098eac3c Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 21:44:41 -0700
Subject: [PATCH 34/78] Added custom Exception for missing package dependency
 which reduces error message boilerplate

---
 .../genutils/importutils/dependencies.py      | 20 ++++++++++++++++++-
 polymerist/mdtools/openfftools/__init__.py    | 13 ++++++------
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/polymerist/genutils/importutils/dependencies.py b/polymerist/genutils/importutils/dependencies.py
index 4787afe..79fa48a 100644
--- a/polymerist/genutils/importutils/dependencies.py
+++ b/polymerist/genutils/importutils/dependencies.py
@@ -3,7 +3,7 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
-from typing import Callable, ParamSpec, TypeVar
+from typing import Callable, Optional, ParamSpec, TypeVar
 
 Params = ParamSpec('Params')
 ReturnType = TypeVar('ReturnType')
@@ -14,6 +14,24 @@
 from functools import wraps
 
 
+class MissingPrerequisitePackage(Exception):
+    '''Raised when a package dependency cannot be found and the user should be alerted with install instructions'''
+    def __init__(self,
+            importing_package_name : str,
+            use_case : str,
+            install_link : str,
+            dependency_name : str,
+            dependency_name_formal : Optional[str]=None
+        ):
+        '''Compose the install-instructions message; "dependency_name_formal" falls back to "dependency_name" when omitted'''
+        if dependency_name_formal is None:
+            dependency_name_formal = dependency_name
+        use_case = use_case[:1].upper() + use_case[1:] # NOTE: str.capitalize() would lowercase the remainder (e.g. "OpenFF" -> "Openff")
+        message = f'''
+        {use_case} require(s) {dependency_name_formal}, which was not found in the current environment
+        Please install `{dependency_name}` by following the installation instructions at {install_link}; then try importing from "{importing_package_name}" again'''
+        super().__init__(message)
+        
 def module_installed(module_name : str) -> bool:
     '''
     Check whether a module of the given name is present on the system
diff --git a/polymerist/mdtools/openfftools/__init__.py b/polymerist/mdtools/openfftools/__init__.py
index 8f1ab65..f0c5f59 100644
--- a/polymerist/mdtools/openfftools/__init__.py
+++ b/polymerist/mdtools/openfftools/__init__.py
@@ -4,13 +4,14 @@
 __email__ = 'timotej.bernat@colorado.edu'
 
 # Subpackage-wide precheck to see if OpenFF is even usable in the first place
-from ...genutils.importutils.dependencies import modules_installed
+from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage
 if not modules_installed('openff', 'openff.toolkit'):
-    raise ModuleNotFoundError(
-        f'''
-        OpenFF packages which are required to utilitize {__name__} not found in current environment
-        Please follow installation instructions at https://docs.openforcefield.org/projects/toolkit/en/stable/installation.html, then retry import
-        '''
+    raise MissingPrerequisitePackage(
+        importing_package_name=__spec__.name,
+        use_case='OpenFF addons',
+        install_link='https://docs.openforcefield.org/projects/toolkit/en/stable/installation.html',
+        dependency_name='openff-toolkit',
+        dependency_name_formal='the OpenFF software stack',
     )
     
 # Import of toplevel OpenFF object registries

From d4f636100e20a60af76019a43bb57a87d8e52b88 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 21:44:49 -0700
Subject: [PATCH 35/78] Deleted superfluous imports

---
 polymerist/polymers/estimation.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/polymerist/polymers/estimation.py b/polymerist/polymers/estimation.py
index 4dc3fa8..b075888 100644
--- a/polymerist/polymers/estimation.py
+++ b/polymerist/polymers/estimation.py
@@ -4,10 +4,8 @@
 __email__ = 'timotej.bernat@colorado.edu'
 
 import numpy as np
-from rdkit import Chem
 
 from .exceptions import InsufficientChainLength
-from ..genutils.iteration import iter_len
 from ..polymers.monomers.repr import MonomerGroup
 from ..rdutils.bonding.portlib import get_num_ports
 

From 88182e7961febc618359c7f578dd67b1cdccaf13 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 21:48:23 -0700
Subject: [PATCH 36/78] Converted polymers.building into a package, split up
 functionality among child modules

---
 polymerist/polymers/building.py            | 287 ---------------------
 polymerist/polymers/building/__init__.py   |  18 ++
 polymerist/polymers/building/linear.py     |  80 ++++++
 polymerist/polymers/building/mbconvert.py  | 126 +++++++++
 polymerist/polymers/building/sequencing.py | 115 +++++++++
 5 files changed, 339 insertions(+), 287 deletions(-)
 delete mode 100644 polymerist/polymers/building.py
 create mode 100644 polymerist/polymers/building/__init__.py
 create mode 100644 polymerist/polymers/building/linear.py
 create mode 100644 polymerist/polymers/building/mbconvert.py
 create mode 100644 polymerist/polymers/building/sequencing.py

diff --git a/polymerist/polymers/building.py b/polymerist/polymers/building.py
deleted file mode 100644
index 6d0cc94..0000000
--- a/polymerist/polymers/building.py
+++ /dev/null
@@ -1,287 +0,0 @@
-'''Utilities for building new polymer structures; currently limited to linear polymers and PDB save format'''
-
-__author__ = 'Timotej Bernat'
-__email__ = 'timotej.bernat@colorado.edu'
-
-import logging
-LOGGER = logging.getLogger(__name__)
-
-from typing import Optional
-
-import warnings
-with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings
-    warnings.filterwarnings('ignore',  category=DeprecationWarning)
-    import mbuild as mb
-    from mbuild import Compound
-    from mbuild.lib.recipes.polymer import Polymer as MBPolymer
-
-from fractions import Fraction
-from pathlib import Path
-from collections import Counter
-
-from rdkit import Chem
-
-from .exceptions import EndGroupDominatedChain, InsufficientChainLength, PartialBlockSequence, MorphologyError
-from .estimation import estimate_n_atoms_linear
-
-from ..genutils.decorators.functional import allow_string_paths
-from ..genutils.textual.substrings import unique_string, repeat_string_to_length
-
-from ..rdutils.bonding.portlib import get_linker_ids
-from ..rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports
-
-from ..mdtools.openmmtools.serialization import serialize_openmm_pdb
-from ..polymers.monomers.repr import MonomerGroup
-from ..polymers.monomers.specification import SANITIZE_AS_KEKULE
-
-
-# CONVERSION
-def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tuple[Compound, list[int]]:
-    '''
-    Accepts a monomer-spec-compliant SMARTS string and returns an mbuild Compound and a list of the indices of atom ports
-    If "resname" is provided, will assign that name to the mBuild Compound returned
-    '''
-    linker_ids = [i for i in get_linker_ids(rdmol)] # record indices of ports - MUST unpack generator for mbuild compatibility
-    
-    # create port-free version of molecule which RDKit can embed without errors
-    prot_mol = hydrogenate_rdmol_ports(rdmol, in_place=False)
-    # prot_mol = saturate_ports(rdmol) # TOSELF : custom, port-based saturation methods are not yet ready for deployment - yield issues in RDKit representation under-the-hood 
-    Chem.SanitizeMol(prot_mol, sanitizeOps=SANITIZE_AS_KEKULE) # ensure Mol is valid (avoids implicitValence issues)
-    
-    mb_compound = mb.conversion.from_rdkit(prot_mol) # native from_rdkit() method actually appears to preserve atom ordering
-    if resname is not None:
-        mb_compound.name = resname
-
-    return mb_compound, linker_ids
-
-@allow_string_paths
-def mbmol_to_openmm_pdb(
-        pdb_path : Path,
-        mbmol : Compound, 
-        num_atom_digits : int=2,
-        resname_map : Optional[dict[str, str]]=None,
-    ) -> None:
-    '''Save an MBuild Compound into an OpenMM-compatible PDB file'''
-    if resname_map is None: # avoid mutable default
-        resname_map = {'RES' : 'Pol'} 
-
-    traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more infor-rich format)
-    omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory
-
-    serialize_openmm_pdb(
-        pdb_path,
-        topology=omm_top,
-        positions=omm_pos,
-        uniquify_atom_ids=True,
-        num_atom_id_digits=num_atom_digits,
-        resname_map=resname_map
-    )
-    
-# TODO: deduplify PDB atom anme and residue numbering code against serialize_openmm_pdb()
-def mbmol_to_rdmol(
-        mbmol : Compound,
-        uniquify_atom_ids : bool=False,
-        num_atom_id_digits : int=2,
-        resname_map : Optional[dict[str, str]]=None
-    ) -> Chem.Mol:
-    '''Convert an mBuild Compound into an RDKit Mol, with correct atom coordinates and PDB residue info'''
-    if resname_map is None:
-        resname_map = {}
-    
-    rdmol = mbmol.to_rdkit()
-    conformer = Chem.Conformer()
-    conformer.Set3D(True)
-
-    atom_id : int = 0
-    element_counter = Counter()
-    for resnum, mb_monomer in enumerate(mbmol.children, start=1):
-        resname = resname_map.get(mb_monomer.name, mb_monomer.name[:3]) # if no remapping is found, just take first 3 chars
-        # NOTE: the order of monomers and atoms within those monomers were added in the same order as iterated over here...
-        #... so the atom indices **SHOULD** be in the correct order (hate that this even might be uncertain)
-        for mbatom in mb_monomer.particles(): 
-            conformer.SetAtomPosition(atom_id, 10*mbatom.pos.astype(float)) # conveert from nm to angstrom
-
-            # set PDB residue info if monomer hierarchy is present
-            if mbatom != mb_monomer: # for Compounds with a flat hierarchy, the children and particles of children will coincide
-                symbol = mbatom.element.symbol
-                atom_ser_id = element_counter[symbol]
-                atom_ser_str = f'{atom_ser_id:0{num_atom_id_digits}d}' if uniquify_atom_ids else '  ' # double space keeps column justification correct when non-unique
-                atom_name = f' {symbol}{atom_ser_str}' # need a leading space to get column alignment in PDB compliant with spec
-                
-                pdb_info = Chem.AtomPDBResidueInfo(
-                    atomName=atom_name, 
-                    residueName=resname,
-                    residueNumber=resnum,
-                    chainId='1',
-                    isHeteroAtom=True,
-                )
-                element_counter[symbol] += 1 # only increment AFTER prior value has been assigned to the current atom
-                rdmol.GetAtomWithIdx(atom_id).SetPDBResidueInfo(pdb_info)
-            
-            atom_id += 1 # TODO: this is an awful waay of keeping track of atom indices, see if there's a more secure way to do this
-    conf_id = rdmol.AddConformer(conformer)
-    
-    return rdmol
-
-# LINEAR POLYMER BUILDING
-def procrustean_polymer_sequence_alignment(
-        sequence : str,
-        n_monomers_target : int,
-        n_monomers_terminal : int,
-        allow_partial_sequences : bool=False
-    ) -> tuple[str, int]:
-    '''
-    For a given polymer block sequence "S", target linear chain length, and number of terminal monomers,
-    Returns a sequence "P" and number of repeats "r" which, taken together, satisfy the following:
-    - The number of monomers in r repeats of P plus the number of terminal monomers is precisely equal to the target number of monomers
-    - The symbols in sequence P cycle through the symbols in S, in the order they appear in S
-    - The number of times S is cycles through in P is always a rational multiple of the length of S
-    If no satisfiable sequence-count pair can be found, raises an appropriate informative exception
-    
-    Named to reflect the fact that the original sequence S will be stretched or truncated to fit the given target sequence length
-    
-    Parameters
-    ----------
-    sequence : str
-        A sequence indicating a periodic ordering of monomers in a linear polymer block (e.g. "A", "ABAC", etc)
-        Each unique symbol in the sequence corresponds to a distinct monomer in the block
-    n_monomers_target : int
-        The desired number of monomers (including terminal monomers) in a polymer chain
-    n_monomers_terminal : int
-        The number of terminal monomers ("end groups") which are to be included in the chain
-        in addition to the middle monomers described by "sequence"
-    allow_partial_sequences : bool, default False
-        Whether to allow fractional repeats of the original sequence in order to meet the target number of monomers
-        
-        For example, to construct a 12-mer chain with 2 end groups from the sequence "BACA", one would require 10 middle monomers
-        which can only be achieved with 2.5 (10/4) sequence repeats, namely as "BACA|BACA|BA"; 
-
-        This behavior may or may not be desired, depending on the use case, and can be controlled by this flag
-    
-    Returns
-    -------
-    sequence_procrustean : str
-        A possibly modified version of the original polymer block sequence
-    n_seq_repeats : int
-        The number of times "sequence_procrustean" must be repeated to achieve the target sequence length
-    
-    Raises
-    ------
-    End GroupDominatedChain
-        The number of terminal monomers exceed the number of total monomers
-    PartialBlockSequence
-        If a partial sequence repeat is required but disallowed (by setting allow_partial_sequences=False)
-    InsufficientChainLength
-        If the target number of monomers results in no middle monomers being included (i.e. neither full NOR partial sequence repeats)
-    '''
-    # Evaluate sizes of missing components from given values
-    block_size = len(sequence)
-    n_mono_middle = n_monomers_target - n_monomers_terminal # number of terminal monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
-    if n_mono_middle < 0:
-        raise EndGroupDominatedChain(f'Registered number of terminal monomers exceeds requested chain length ({n_monomers_target}-mer chain can\'t possibly contain {n_monomers_terminal} terminal monomers)')
-    
-    n_seq_whole : int         # number of full sequence repeats to reach a number of monomers less than or equal to the target
-    n_symbols_remaining : int # number of any remaining symbols in sequence (i.e. monomers) needed to close the gap to the target (allowed to be 0 if target is a multiple of the sequence length)
-    n_seq_whole, n_symbols_remaining = divmod(n_mono_middle, block_size) 
-
-    # Break down into cases by whether or not a whole number of sequence repeats is possible
-    if n_symbols_remaining != 0: # a whole number of sequence repeats (including possibly 0) plus some fraction of a full block sequence
-        if not allow_partial_sequences:
-            raise PartialBlockSequence(
-                f'Partial polymer block sequence required to meet target number of monomers ("{sequence[:n_symbols_remaining]}" prefix of sequence "{sequence}"). ' \
-                'If this is acceptable, set "allow_partial_sequences=True" and try calling build routine again'
-            )    
-        sequence_procrustean = repeat_string_to_length(sequence, target_length=n_mono_middle, joiner='')
-        n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit)
-    else: # for a purely-whole number of block sequence repeats
-        if n_seq_whole < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced my hand
-            raise InsufficientChainLength(
-                f'{n_monomers_target}-monomer chain cannot accomodate both {n_monomers_terminal} end groups AND at least 1 middle monomer sequence'
-            )
-        sequence_procrustean = sequence # NOTE: rename here is for clarity, and for consistency with partial sequence case
-        n_seq_repeats = n_seq_whole
-        
-    # Generate descriptive log message to summarize sequence modifications
-    ## Determine info present for whole and partial sections
-    desc_seq_counts_parts = []
-    desc_seq_order_middle = []
-    
-    if n_seq_whole != 0: ## Whole sequence strings
-        desc_seq_counts_parts.append(f'{n_seq_whole} whole {block_size}-sequence repeats')
-        desc_seq_order_middle.append(f'{n_seq_whole}*[{sequence}]')
-        
-    if n_symbols_remaining != 0: ## Partial sequence strings
-        desc_seq_counts_parts.append(f'a partial {n_symbols_remaining}/{block_size} sequence repeat')
-        desc_seq_order_middle.append(f'[{sequence[:n_symbols_remaining]}]')
-        
-    ## Finalizing sequence counts descriptor parts
-    tally_str = f'({n_seq_whole}*{block_size} + {n_symbols_remaining}) middle monomers + {n_monomers_terminal} terminal monomers = {n_monomers_target} total monomers)'
-    if len(desc_seq_counts_parts) == 2:
-        desc_seq_counts_parts.insert(1, ' and ') # include conjunction if a mixed (i.e. both whole and fractional) solution was found
-    
-    ## Finalizing sequence order descriptor parts
-    desc_seq_order_parts = ['[END-GROUP]']*n_monomers_terminal # abut with correct amount of end group indicators
-    desc_seq_order_parts[1:-1] = desc_seq_order_middle # insert middle sections for whole and partial sequences
-    
-    ## putting everything together
-    LOGGER.info(f'Target chain length achievable with {"".join(desc_seq_counts_parts)};\n Namely, polymer will be sequenced as {" + ".join(desc_seq_order_parts)}, yielding {tally_str}')
-        
-    return sequence_procrustean, n_seq_repeats
-
-def build_linear_polymer(
-        monomers : MonomerGroup,
-        n_monomers : int,
-        sequence : str='A',
-        allow_partial_sequences : bool=False,
-        add_Hs : bool=False,
-        energy_minimize : bool=False,
-    ) -> MBPolymer:
-    '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON)
-    and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
-    # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE) - DEV: consider making a separate function
-    end_groups = monomers.linear_end_groups() # cache end groups so they dont need to be recalculated when registering end groups
-    sequence_compliant, n_seq_repeats = procrustean_polymer_sequence_alignment(
-        sequence,
-        n_monomers_target=n_monomers,
-        n_monomers_terminal=len(end_groups), # number of terminal monomers are actually present and well-defined
-        allow_partial_sequences=allow_partial_sequences,
-    )
-    sequence_unique = unique_string(sequence_compliant, preserve_order=True) # only register a new monomer for each appearance of a new, unique symbol in the sequence
-    
-    # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
-    chain = MBPolymer() 
-    monomers_selected = MonomerGroup() # used to track and estimate sized of the monomers being used for building
-    
-    ## 2A) ADD MIDDLE MONOMERS TO CHAIN
-    for (resname, middle_monomer), symbol in zip(monomers.iter_rdmols(term_only=False), sequence_unique): # zip with sequence limits number of middle monomers to length of block sequence
-        LOGGER.info(f'Registering middle monomer {resname} (block identifier "{symbol}")')
-        mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname)
-        chain.add_monomer(compound=mb_monomer, indices=linker_ids)
-        monomers_selected.monomers[resname] = monomers.monomers[resname]
-
-    ## 2B) ADD TERMINAL MONOMERS TO CHAIN
-    for head_or_tail, (resname, term_monomer) in end_groups.items():
-        LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")')
-        mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer, resname=resname)
-        chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
-        monomers_selected.monomers[resname] = monomers.monomers[resname]
-
-    # 3) ASSEMBLE AND RETURN CHAIN
-    if not monomers_selected.is_linear: # verify the selected monomers actually define a linear polymer
-        raise MorphologyError('Linear polymer building does not support non-linear monomer input')
-    
-    n_atoms_est = estimate_n_atoms_linear(monomers_selected, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
-    LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)')
-    
-    chain.build(n_seq_repeats, sequence=sequence_compliant, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
-    for atom in chain.particles():
-        atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings)
-    LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {chain.n_particles} atoms)')
-    
-    # 4) OPTIONALLY, PERFORM FINAL UFF ENERGY MINIMIZATION
-    if energy_minimize:
-        LOGGER.info('Energy-minimizing chain to find more stable conformer')
-        chain.energy_minimize()
-        LOGGER.info('Energy minimization completed')
-
-    return chain
\ No newline at end of file
diff --git a/polymerist/polymers/building/__init__.py b/polymerist/polymers/building/__init__.py
new file mode 100644
index 0000000..f2b636a
--- /dev/null
+++ b/polymerist/polymers/building/__init__.py
@@ -0,0 +1,18 @@
+'''
+Tools for building polymer conformers out of monomer SMARTS fragments
+Currently restricted to building linear homopolymers and periodic block copolymers
+'''
+
+from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage
+
+if not modules_installed('mbuild'): # subpackage-wide precheck: fail early with install instructions if mBuild is unavailable
+    raise MissingPrerequisitePackage(
+        importing_package_name=__spec__.name,
+        use_case='Polymer building',
+        install_link='https://mbuild.mosdef.org/en/stable/getting_started/installation/installation.html',
+        dependency_name='mbuild',
+        dependency_name_formal='mBuild',
+    )
+    
+from .linear import build_linear_polymer
+from .mbconvert import mbmol_to_openmm_pdb, mbmol_from_mono_rdmol, mbmol_to_rdmol
\ No newline at end of file
diff --git a/polymerist/polymers/building/linear.py b/polymerist/polymers/building/linear.py
new file mode 100644
index 0000000..aa726da
--- /dev/null
+++ b/polymerist/polymers/building/linear.py
@@ -0,0 +1,80 @@
+'''For generating linear polymer structure from monomer, sequence, and chain length information'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+import logging
+LOGGER = logging.getLogger(__name__)
+
+import warnings
+with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings
+    warnings.filterwarnings('ignore',  category=DeprecationWarning)
+    import mbuild as mb
+    from mbuild import Compound
+    from mbuild.lib.recipes.polymer import Polymer as MBPolymer
+
+from .mbconvert import mbmol_from_mono_rdmol
+from .sequencing import procrustean_polymer_sequence_alignment
+from ..exceptions import MorphologyError
+from ..monomers.repr import MonomerGroup
+from ..estimation import estimate_n_atoms_linear
+from ...genutils.textual.substrings import unique_string
+
+
+def build_linear_polymer(
+        monomers : MonomerGroup,
+        n_monomers : int,
+        sequence : str='A',
+        allow_partial_sequences : bool=False,
+        add_Hs : bool=False,
+        energy_minimize : bool=False,
+    ) -> MBPolymer:
+    '''Accepts a dict of monomer residue names and SMARTS (as one might find in a monomer JSON)
+    and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
+    # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE) - DEV: consider making a separate function
+    end_groups = monomers.linear_end_groups() # cache end groups so they dont need to be recalculated when registering end groups
+    sequence_compliant, n_seq_repeats = procrustean_polymer_sequence_alignment(
+        sequence,
+        n_monomers_target=n_monomers,
+        n_monomers_terminal=len(end_groups), # number of terminal monomers are actually present and well-defined
+        allow_partial_sequences=allow_partial_sequences,
+    )
+    sequence_unique = unique_string(sequence_compliant, preserve_order=True) # only register a new monomer for each appearance of a new, unique symbol in the sequence
+    
+    # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
+    chain = MBPolymer() 
+    monomers_selected = MonomerGroup() # used to track and estimate sized of the monomers being used for building
+    
+    ## 2A) ADD MIDDLE MONOMERS TO CHAIN
+    for (resname, middle_monomer), symbol in zip(monomers.iter_rdmols(term_only=False), sequence_unique): # zip with sequence limits number of middle monomers to length of block sequence
+        LOGGER.info(f'Registering middle monomer {resname} (block identifier "{symbol}")')
+        mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname)
+        chain.add_monomer(compound=mb_monomer, indices=linker_ids)
+        monomers_selected.monomers[resname] = monomers.monomers[resname]
+
+    ## 2B) ADD TERMINAL MONOMERS TO CHAIN
+    for head_or_tail, (resname, term_monomer) in end_groups.items():
+        LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")')
+        mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer, resname=resname)
+        chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
+        monomers_selected.monomers[resname] = monomers.monomers[resname]
+
+    # 3) ASSEMBLE AND RETURN CHAIN
+    if not monomers_selected.is_linear: # verify the selected monomers actually define a linear polymer
+        raise MorphologyError('Linear polymer building does not support non-linear monomer input')
+    
+    n_atoms_est = estimate_n_atoms_linear(monomers_selected, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
+    LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)')
+    
+    chain.build(n_seq_repeats, sequence=sequence_compliant, add_hydrogens=add_Hs) # in mbuild, "n" is the number of times to replicate just the middle monomer sequence (end groups are not counted in n_seq_repeats)
+    for atom in chain.particles():
+        atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings)
+    LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {chain.n_particles} atoms)')
+    
+    # 4) OPTIONALLY, PERFORM FINAL UFF ENERGY MINIMIZATION
+    if energy_minimize:
+        LOGGER.info('Energy-minimizing chain to find more stable conformer')
+        chain.energy_minimize()
+        LOGGER.info('Energy minimization completed')
+
+    return chain
\ No newline at end of file
diff --git a/polymerist/polymers/building/mbconvert.py b/polymerist/polymers/building/mbconvert.py
new file mode 100644
index 0000000..7fc3075
--- /dev/null
+++ b/polymerist/polymers/building/mbconvert.py
@@ -0,0 +1,126 @@
+'''
+Enhanced conversions to and from mbuild Compound objects which
+preserve more molecular information than the utilities provided by mbuild itself
+'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage
+
+if not modules_installed('mbuild'):
+    MissingPrerequisitePackage(
+        importing_package_name=__spec__.name,
+        use_case='Translation between chemical representations of polymers',
+        install_link='https://libraries.io/conda/openbabel',
+        dependency_name='openbabel',
+        dependency_name_formal='the OpenBabel chemical toolbox',
+    )
+
+from typing import Optional
+
+from pathlib import Path
+from collections import Counter
+
+from rdkit import Chem
+
+import warnings
+with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings
+    warnings.filterwarnings('ignore',  category=DeprecationWarning)
+    from mbuild import Compound
+    from mbuild.conversion import from_rdkit
+    
+from ..monomers.specification import SANITIZE_AS_KEKULE
+from ...genutils.decorators.functional import allow_string_paths
+from ...rdutils.bonding.portlib import get_linker_ids
+from ...rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports
+from ...mdtools.openmmtools.serialization import serialize_openmm_pdb
+
+
+
+def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tuple[Compound, list[int]]:
+    '''
+    Accepts a monomer-spec-compliant RDKit Mol and returns an mbuild Compound and a list of the indices of atom ports
+    If "resname" is provided, will assign that name to the mBuild Compound returned
+    '''
+    linker_ids = [i for i in get_linker_ids(rdmol)] # record indices of ports - MUST unpack generator for mbuild compatibility
+    
+    # create port-free version of molecule which RDKit can embed without errors
+    prot_mol = hydrogenate_rdmol_ports(rdmol, in_place=False)
+    # prot_mol = saturate_ports(rdmol) # TOSELF : custom, port-based saturation methods are not yet ready for deployment - yield issues in RDKit representation under-the-hood 
+    Chem.SanitizeMol(prot_mol, sanitizeOps=SANITIZE_AS_KEKULE) # ensure Mol is valid (avoids implicitValence issues)
+    
+    mb_compound = from_rdkit(prot_mol) # native from_rdkit() method actually appears to preserve atom ordering
+    if resname is not None:
+        mb_compound.name = resname
+
+    return mb_compound, linker_ids
+
+@allow_string_paths
+def mbmol_to_openmm_pdb(
+        pdb_path : Path,
+        mbmol : Compound, 
+        num_atom_digits : int=2,
+        resname_map : Optional[dict[str, str]]=None,
+    ) -> None:
+    '''Save an MBuild Compound into an OpenMM-compatible PDB file'''
+    if resname_map is None: # avoid mutable default
+        resname_map = {'RES' : 'Pol'} 
+
+    traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more information-rich format)
+    omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory
+
+    serialize_openmm_pdb(
+        pdb_path,
+        topology=omm_top,
+        positions=omm_pos,
+        uniquify_atom_ids=True,
+        num_atom_id_digits=num_atom_digits,
+        resname_map=resname_map
+    )
+    
+# TODO: deduplicate PDB atom name and residue numbering code against serialize_openmm_pdb()
+def mbmol_to_rdmol(
+        mbmol : Compound,
+        uniquify_atom_ids : bool=False,
+        num_atom_id_digits : int=2,
+        resname_map : Optional[dict[str, str]]=None
+    ) -> Chem.Mol:
+    '''Convert an mBuild Compound into an RDKit Mol, with correct atom coordinates and PDB residue info'''
+    if resname_map is None:
+        resname_map = {}
+    
+    rdmol = mbmol.to_rdkit()
+    conformer = Chem.Conformer()
+    conformer.Set3D(True)
+
+    atom_id : int = 0
+    element_counter = Counter()
+    for resnum, mb_monomer in enumerate(mbmol.children, start=1):
+        resname = resname_map.get(mb_monomer.name, mb_monomer.name[:3]) # if no remapping is found, just take first 3 chars
+        # NOTE: the order of monomers and atoms within those monomers were added in the same order as iterated over here...
+        #... so the atom indices **SHOULD** be in the correct order (hate that this even might be uncertain)
+        for mbatom in mb_monomer.particles(): 
+            conformer.SetAtomPosition(atom_id, 10*mbatom.pos.astype(float)) # convert from nm to angstrom
+
+            # set PDB residue info if monomer hierarchy is present
+            if mbatom != mb_monomer: # for Compounds with a flat hierarchy, the children and particles of children will coincide
+                symbol = mbatom.element.symbol
+                atom_ser_id = element_counter[symbol]
+                atom_ser_str = f'{atom_ser_id:0{num_atom_id_digits}d}' if uniquify_atom_ids else '  ' # double space keeps column justification correct when non-unique
+                atom_name = f' {symbol}{atom_ser_str}' # need a leading space to get column alignment in PDB compliant with spec
+                
+                pdb_info = Chem.AtomPDBResidueInfo(
+                    atomName=atom_name, 
+                    residueName=resname,
+                    residueNumber=resnum,
+                    chainId='1',
+                    isHeteroAtom=True,
+                )
+                element_counter[symbol] += 1 # only increment AFTER prior value has been assigned to the current atom
+                rdmol.GetAtomWithIdx(atom_id).SetPDBResidueInfo(pdb_info)
+            
+            atom_id += 1 # TODO: this is an awful way of keeping track of atom indices, see if there's a more secure way to do this
+    conf_id = rdmol.AddConformer(conformer)
+    
+    return rdmol
\ No newline at end of file
diff --git a/polymerist/polymers/building/sequencing.py b/polymerist/polymers/building/sequencing.py
new file mode 100644
index 0000000..de94ee3
--- /dev/null
+++ b/polymerist/polymers/building/sequencing.py
@@ -0,0 +1,115 @@
+'''For generating and manipulating sequences of symbols which correspond to monomer ordering in blocky and random copolymers'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+import logging
+LOGGER = logging.getLogger(__name__)
+
+from ..exceptions import EndGroupDominatedChain, InsufficientChainLength, PartialBlockSequence
+from ...genutils.textual.substrings import repeat_string_to_length
+
+
+def procrustean_polymer_sequence_alignment(
+        sequence : str,
+        n_monomers_target : int,
+        n_monomers_terminal : int,
+        allow_partial_sequences : bool=False
+    ) -> tuple[str, int]:
+    '''
+    For a given polymer block sequence "S", target linear chain length, and number of terminal monomers,
+    Returns a sequence "P" and number of repeats "r" which, taken together, satisfy the following:
+    - The number of monomers in r repeats of P plus the number of terminal monomers is precisely equal to the target number of monomers
+    - The symbols in sequence P cycle through the symbols in S, in the order they appear in S
+    - The number of times S is cycles through in P is always a rational multiple of the length of S
+    If no satisfiable sequence-count pair can be found, raises an appropriate informative exception
+    
+    Named to reflect the fact that the original sequence S will be stretched or truncated to fit the given target sequence length
+    
+    Parameters
+    ----------
+    sequence : str
+        A sequence indicating a periodic ordering of monomers in a linear polymer block (e.g. "A", "ABAC", etc)
+        Each unique symbol in the sequence corresponds to a distinct monomer in the block
+    n_monomers_target : int
+        The desired number of monomers (including terminal monomers) in a polymer chain
+    n_monomers_terminal : int
+        The number of terminal monomers ("end groups") which are to be included in the chain
+        in addition to the middle monomers described by "sequence"
+    allow_partial_sequences : bool, default False
+        Whether to allow fractional repeats of the original sequence in order to meet the target number of monomers
+        
+        For example, to construct a 12-mer chain with 2 end groups from the sequence "BACA", one would require 10 middle monomers
+        which can only be achieved with 2.5 (10/4) sequence repeats, namely as "BACA|BACA|BA"; 
+
+        This behavior may or may not be desired, depending on the use case, and can be controlled by this flag
+    
+    Returns
+    -------
+    sequence_procrustean : str
+        A possibly modified version of the original polymer block sequence
+    n_seq_repeats : int
+        The number of times "sequence_procrustean" must be repeated to achieve the target sequence length
+    
+    Raises
+    ------
+    End GroupDominatedChain
+        The number of terminal monomers exceed the number of total monomers
+    PartialBlockSequence
+        If a partial sequence repeat is required but disallowed (by setting allow_partial_sequences=False)
+    InsufficientChainLength
+        If the target number of monomers results in no middle monomers being included (i.e. neither full NOR partial sequence repeats)
+    '''
+    # Evaluate sizes of missing components from given values
+    block_size = len(sequence)
+    n_mono_middle = n_monomers_target - n_monomers_terminal # number of terminal monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
+    if n_mono_middle < 0:
+        raise EndGroupDominatedChain(f'Registered number of terminal monomers exceeds requested chain length ({n_monomers_target}-mer chain can\'t possibly contain {n_monomers_terminal} terminal monomers)')
+    
+    n_seq_whole : int         # number of full sequence repeats to reach a number of monomers less than or equal to the target
+    n_symbols_remaining : int # number of any remaining symbols in sequence (i.e. monomers) needed to close the gap to the target (allowed to be 0 if target is a multiple of the sequence length)
+    n_seq_whole, n_symbols_remaining = divmod(n_mono_middle, block_size) 
+
+    # Break down into cases by whether or not a whole number of sequence repeats is possible
+    if n_symbols_remaining != 0: # a whole number of sequence repeats (including possibly 0) plus some fraction of a full block sequence
+        if not allow_partial_sequences:
+            raise PartialBlockSequence(
+                f'Partial polymer block sequence required to meet target number of monomers ("{sequence[:n_symbols_remaining]}" prefix of sequence "{sequence}"). ' \
+                'If this is acceptable, set "allow_partial_sequences=True" and try calling build routine again'
+            )    
+        sequence_procrustean = repeat_string_to_length(sequence, target_length=n_mono_middle, joiner='')
+        n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit)
+    else: # for a purely-whole number of block sequence repeats
+        if n_seq_whole < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced my hand
+            raise InsufficientChainLength(
+                f'{n_monomers_target}-monomer chain cannot accomodate both {n_monomers_terminal} end groups AND at least 1 middle monomer sequence'
+            )
+        sequence_procrustean = sequence # NOTE: rename here is for clarity, and for consistency with partial sequence case
+        n_seq_repeats = n_seq_whole
+        
+    # Generate descriptive log message to summarize sequence modifications
+    ## Determine info present for whole and partial sections
+    desc_seq_counts_parts = []
+    desc_seq_order_middle = []
+    
+    if n_seq_whole != 0: ## Whole sequence strings
+        desc_seq_counts_parts.append(f'{n_seq_whole} whole {block_size}-sequence repeats')
+        desc_seq_order_middle.append(f'{n_seq_whole}*[{sequence}]')
+        
+    if n_symbols_remaining != 0: ## Partial sequence strings
+        desc_seq_counts_parts.append(f'a partial {n_symbols_remaining}/{block_size} sequence repeat')
+        desc_seq_order_middle.append(f'[{sequence[:n_symbols_remaining]}]')
+        
+    ## Finalizing sequence counts descriptor parts
+    tally_str = f'({n_seq_whole}*{block_size} + {n_symbols_remaining}) middle monomers + {n_monomers_terminal} terminal monomers = {n_monomers_target} total monomers)'
+    if len(desc_seq_counts_parts) == 2:
+        desc_seq_counts_parts.insert(1, ' and ') # include conjunction if a mixed (i.e. both whole and fractional) solution was found
+    
+    ## Finalizing sequence order descriptor parts
+    desc_seq_order_parts = ['[END-GROUP]']*n_monomers_terminal # abut with correct amount of end group indicators
+    desc_seq_order_parts[1:-1] = desc_seq_order_middle # insert middle sections for whole and partial sequences
+    
+    ## putting everything together
+    LOGGER.info(f'Target chain length achievable with {"".join(desc_seq_counts_parts)};\n Namely, polymer will be sequenced as {" + ".join(desc_seq_order_parts)}, yielding {tally_str}')
+        
+    return sequence_procrustean, n_seq_repeats

From c66a3f23bd0a2849b0c6c625cc97b20eb13ebc20 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 21:56:09 -0700
Subject: [PATCH 37/78] Fixed missing "raise" keywords and incorrect package
 checks

---
 polymerist/polymers/building/__init__.py  | 2 +-
 polymerist/polymers/building/mbconvert.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/polymerist/polymers/building/__init__.py b/polymerist/polymers/building/__init__.py
index f2b636a..8fdd0b7 100644
--- a/polymerist/polymers/building/__init__.py
+++ b/polymerist/polymers/building/__init__.py
@@ -6,7 +6,7 @@
 from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage
 
 if not modules_installed('mbuild'):
-    MissingPrerequisitePackage(
+    raise MissingPrerequisitePackage(
         importing_package_name=__spec__.name,
         use_case='Polymer building',
         install_link='https://mbuild.mosdef.org/en/stable/getting_started/installation/installation.html',
diff --git a/polymerist/polymers/building/mbconvert.py b/polymerist/polymers/building/mbconvert.py
index 7fc3075..b7ae69f 100644
--- a/polymerist/polymers/building/mbconvert.py
+++ b/polymerist/polymers/building/mbconvert.py
@@ -8,8 +8,8 @@
 
 from ...genutils.importutils.dependencies import modules_installed, MissingPrerequisitePackage
 
-if not modules_installed('mbuild'):
-    MissingPrerequisitePackage(
+if not modules_installed('openbabel'):
+    raise MissingPrerequisitePackage(
         importing_package_name=__spec__.name,
         use_case='Translation between chemical representations of polymers',
         install_link='https://libraries.io/conda/openbabel',

From e69682892241d33a3725f31b114c48be80abcccf Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Thu, 5 Dec 2024 21:56:35 -0700
Subject: [PATCH 38/78] Fiddled with MissingPrerequisitePackage error message
 format

---
 polymerist/genutils/importutils/dependencies.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/polymerist/genutils/importutils/dependencies.py b/polymerist/genutils/importutils/dependencies.py
index 79fa48a..a69b4c8 100644
--- a/polymerist/genutils/importutils/dependencies.py
+++ b/polymerist/genutils/importutils/dependencies.py
@@ -28,7 +28,8 @@ def __init__(self,
         
         message = f'''
         {use_case.capitalize()} require(s) {dependency_name_formal}, which was not found in the current environment
-        Please install `{dependency_name}` by following the installation instructions at {install_link}; then try importing from "{importing_package_name}" again'''
+        Please install `{dependency_name}` by following the installation instructions at {install_link}
+        Then try importing from "{importing_package_name}" again'''
         
         super().__init__(message)
         

From 31f0675584d9ff623c7dfd20558f75932d9c6e4d Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Fri, 6 Dec 2024 16:55:36 -0700
Subject: [PATCH 39/78] Added Exception for unexpectedly-empty copolymer
 sequences

---
 polymerist/polymers/exceptions.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/polymerist/polymers/exceptions.py b/polymerist/polymers/exceptions.py
index b7502cb..e0060f9 100644
--- a/polymerist/polymers/exceptions.py
+++ b/polymerist/polymers/exceptions.py
@@ -3,7 +3,7 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
-
+# CHAIN LENGTH AND SHAPE ERRORS
 class InsufficientChainLength(Exception):
     '''Raised when the polymer molecule being built is too short'''
     pass
@@ -15,14 +15,20 @@ class ExcessiveChainLength(Exception):
 class EndGroupDominatedChain(Exception):
     '''Raised to indicate there are more end groups present in a chain than are monomer possibly allowed'''
 
-class PartialBlockSequence(Exception):
-    '''Raised when an non-whole number of copolymer blocks is needed to reach a target chain length (and is not allowed)'''
-    pass
-
 class MorphologyError(Exception):
     '''Raised when a polymer does not have the morphology (i.e. crosslinking, molecular weight, etc) an application expects'''
     pass
 
+# COPOLYMER SEQUENCING ERRORS
+class EmptyBlockSequence(Exception):
+    '''Raised when a trivial sequence of copolymer blocks (i.e. the empty string "") is provided when not expected'''
+    pass
+
+class PartialBlockSequence(Exception):
+    '''Raised when a non-whole number of copolymer blocks is needed to reach a target chain length (and is not allowed)'''
+    pass
+
+# POLYMERIZATION MISINFORMATION ERRORS
 class AlreadySolvated(Exception):
     '''Raised when attempting to add solvent to a molecule which already has solvent'''
     pass

From 79296ae9fca19122bc504ccc58e90e3a24fc9ef5 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 14:16:39 -0700
Subject: [PATCH 40/78] Added precheck for empty sequence kernel

---
 polymerist/polymers/building/sequencing.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/polymerist/polymers/building/sequencing.py b/polymerist/polymers/building/sequencing.py
index de94ee3..3a2b8d5 100644
--- a/polymerist/polymers/building/sequencing.py
+++ b/polymerist/polymers/building/sequencing.py
@@ -6,7 +6,7 @@
 import logging
 LOGGER = logging.getLogger(__name__)
 
-from ..exceptions import EndGroupDominatedChain, InsufficientChainLength, PartialBlockSequence
+from ..exceptions import EndGroupDominatedChain, InsufficientChainLength, EmptyBlockSequence, PartialBlockSequence
 from ...genutils.textual.substrings import repeat_string_to_length
 
 
@@ -61,6 +61,9 @@ def procrustean_polymer_sequence_alignment(
         If the target number of monomers results in no middle monomers being included (i.e. neither full NOR partial sequence repeats)
     '''
     # Evaluate sizes of missing components from given values
+    if not sequence:
+            raise EmptyBlockSequence('Must provide non-empty sequence kernel to yield a valid (co)polymer sequence')
+    
     block_size = len(sequence)
     n_mono_middle = n_monomers_target - n_monomers_terminal # number of terminal monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
     if n_mono_middle < 0:

From cb2587a1c050519beff92b37261977b049309706 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 14:38:27 -0700
Subject: [PATCH 41/78] Expanded PROCRUSTEAN sequencing algorithm into
 dedicated dataclass

---
 polymerist/polymers/building/linear.py     |  24 +-
 polymerist/polymers/building/sequencing.py | 245 ++++++++++++++-------
 2 files changed, 179 insertions(+), 90 deletions(-)

diff --git a/polymerist/polymers/building/linear.py b/polymerist/polymers/building/linear.py
index aa726da..62e90a6 100644
--- a/polymerist/polymers/building/linear.py
+++ b/polymerist/polymers/building/linear.py
@@ -14,7 +14,7 @@
     from mbuild.lib.recipes.polymer import Polymer as MBPolymer
 
 from .mbconvert import mbmol_from_mono_rdmol
-from .sequencing import procrustean_polymer_sequence_alignment
+from .sequencing import LinearCopolymerSequencer
 from ..exceptions import MorphologyError
 from ..monomers.repr import MonomerGroup
 from ..estimation import estimate_n_atoms_linear
@@ -25,6 +25,7 @@ def build_linear_polymer(
         monomers : MonomerGroup,
         n_monomers : int,
         sequence : str='A',
+        minimize_sequence : bool=True,
         allow_partial_sequences : bool=False,
         add_Hs : bool=False,
         energy_minimize : bool=False,
@@ -33,11 +34,20 @@ def build_linear_polymer(
     and a degree of polymerization (i.e. chain length in number of monomers)) and returns an mbuild Polymer object'''
     # 1) DETERMINE NUMBER OF SEQUENCE REPEATS NEEDED TO MEET TARGET NUMBER OF MONOMER UNITS (IF POSSIBLE) - DEV: consider making a separate function
     end_groups = monomers.linear_end_groups() # cache end groups so they dont need to be recalculated when registering end groups
-    sequence_compliant, n_seq_repeats = procrustean_polymer_sequence_alignment(
-        sequence,
-        n_monomers_target=n_monomers,
-        n_monomers_terminal=len(end_groups), # number of terminal monomers are actually present and well-defined
-        allow_partial_sequences=allow_partial_sequences,
+    end_group_names = [resname for (resname, _) in end_groups.values()]
+    
+    sequencer = LinearCopolymerSequencer(
+        sequence_kernel=sequence,
+        n_repeat_units=n_monomers,
+        n_repeat_units_terminal=len(end_groups)
+    )
+    if minimize_sequence:
+        sequencer.reduce() # identify minimal subsequences
+    
+    sequence_compliant, n_seq_repeats = sequencer.procrustean_alignment(allow_partial_sequences=allow_partial_sequences)
+    LOGGER.info(
+        f'Target chain length achievable with {sequencer.describe_tally()}, ' \
+        f'namely with the sequence {sequencer.describe_order(end_group_names=end_group_names)}'
     )
     sequence_unique = unique_string(sequence_compliant, preserve_order=True) # only register a new monomer for each appearance of a new, unique symbol in the sequence
     
@@ -46,7 +56,7 @@ def build_linear_polymer(
     monomers_selected = MonomerGroup() # used to track and estimate sized of the monomers being used for building
     
     ## 2A) ADD MIDDLE MONOMERS TO CHAIN
-    for (resname, middle_monomer), symbol in zip(monomers.iter_rdmols(term_only=False), sequence_unique): # zip with sequence limits number of middle monomers to length of block sequence
+    for symbol, (resname, middle_monomer) in zip(sequence_unique, monomers.iter_rdmols(term_only=False)): # zip with sequence limits number of middle monomers to length of block sequence
         LOGGER.info(f'Registering middle monomer {resname} (block identifier "{symbol}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname)
         chain.add_monomer(compound=mb_monomer, indices=linker_ids)
diff --git a/polymerist/polymers/building/sequencing.py b/polymerist/polymers/building/sequencing.py
index 3a2b8d5..8693118 100644
--- a/polymerist/polymers/building/sequencing.py
+++ b/polymerist/polymers/building/sequencing.py
@@ -6,113 +6,192 @@
 import logging
 LOGGER = logging.getLogger(__name__)
 
-from ..exceptions import EndGroupDominatedChain, InsufficientChainLength, EmptyBlockSequence, PartialBlockSequence
-from ...genutils.textual.substrings import repeat_string_to_length
+from typing import Iterable, Optional
+from dataclasses import dataclass, field, asdict
 
+from polymerist.polymers.exceptions import EndGroupDominatedChain, InsufficientChainLength, EmptyBlockSequence, PartialBlockSequence
+from polymerist.genutils.textual.substrings import shortest_repeating_substring, repeat_string_to_length
 
-def procrustean_polymer_sequence_alignment(
-        sequence : str,
-        n_monomers_target : int,
-        n_monomers_terminal : int,
-        allow_partial_sequences : bool=False
-    ) -> tuple[str, int]:
+
+@dataclass
+class LinearCopolymerSequencer:
     '''
-    For a given polymer block sequence "S", target linear chain length, and number of terminal monomers,
-    Returns a sequence "P" and number of repeats "r" which, taken together, satisfy the following:
-    - The number of monomers in r repeats of P plus the number of terminal monomers is precisely equal to the target number of monomers
-    - The symbols in sequence P cycle through the symbols in S, in the order they appear in S
-    - The number of times S is cycles through in P is always a rational multiple of the length of S
-    If no satisfiable sequence-count pair can be found, raises an appropriate informative exception
-    
-    Named to reflect the fact that the original sequence S will be stretched or truncated to fit the given target sequence length
+    For encapsulating information about the sequence of repeat units in a periodic, linear copolymer
+    Also covers, as trivial special cases, homopolymers and alternating copolymers
     
     Parameters
     ----------
-    sequence : str
+    sequence_kernel : str
         A sequence indicating a periodic ordering of monomers in a linear polymer block (e.g. "A", "ABAC", etc)
         Each unique symbol in the sequence corresponds to a distinct monomer in the block
-    n_monomers_target : int
-        The desired number of monomers (including terminal monomers) in a polymer chain
+    n_repeat_units : int
+        The desired total number of monomers (including terminal monomers) in a polymer chain
     n_monomers_terminal : int
         The number of terminal monomers ("end groups") which are to be included in the chain
         in addition to the middle monomers described by "sequence"
-    allow_partial_sequences : bool, default False
-        Whether to allow fractional repeats of the original sequence in order to meet the target number of monomers
         
-        For example, to construct a 12-mer chain with 2 end groups from the sequence "BACA", one would require 10 middle monomers
-        which can only be achieved with 2.5 (10/4) sequence repeats, namely as "BACA|BACA|BA"; 
-
-        This behavior may or may not be desired, depending on the use case, and can be controlled by this flag
-    
-    Returns
-    -------
-    sequence_procrustean : str
-        A possibly modified version of the original polymer block sequence
-    n_seq_repeats : int
-        The number of times "sequence_procrustean" must be repeated to achieve the target sequence length
-    
     Raises
     ------
+    EmptyBlockSequence
+        The sequence provided is empty (can't be used to define nonzero-length chain)
     End GroupDominatedChain
         The number of terminal monomers exceed the number of total monomers
-    PartialBlockSequence
-        If a partial sequence repeat is required but disallowed (by setting allow_partial_sequences=False)
-    InsufficientChainLength
-        If the target number of monomers results in no middle monomers being included (i.e. neither full NOR partial sequence repeats)
     '''
-    # Evaluate sizes of missing components from given values
-    if not sequence:
-            raise EmptyBlockSequence('Must provide non-empty sequence kernel to yield a valid (co)polymer sequence')
+    sequence_kernel : str
+    n_repeat_units : int
+    n_repeat_units_terminal : int = 0
     
-    block_size = len(sequence)
-    n_mono_middle = n_monomers_target - n_monomers_terminal # number of terminal monomers needed to reach target; in a linear chain, all monomers are either middle or terminal
-    if n_mono_middle < 0:
-        raise EndGroupDominatedChain(f'Registered number of terminal monomers exceeds requested chain length ({n_monomers_target}-mer chain can\'t possibly contain {n_monomers_terminal} terminal monomers)')
+    # Attribute checks and modifications
+    def __post_init__(self) -> None:
+        if not self.sequence_kernel:
+            raise EmptyBlockSequence('Must provide non-empty sequence kernel to yield a valid (co)polymer sequence')
     
-    n_seq_whole : int         # number of full sequence repeats to reach a number of monomers less than or equal to the target
-    n_symbols_remaining : int # number of any remaining symbols in sequence (i.e. monomers) needed to close the gap to the target (allowed to be 0 if target is a multiple of the sequence length)
-    n_seq_whole, n_symbols_remaining = divmod(n_mono_middle, block_size) 
-
-    # Break down into cases by whether or not a whole number of sequence repeats is possible
-    if n_symbols_remaining != 0: # a whole number of sequence repeats (including possibly 0) plus some fraction of a full block sequence
-        if not allow_partial_sequences:
-            raise PartialBlockSequence(
-                f'Partial polymer block sequence required to meet target number of monomers ("{sequence[:n_symbols_remaining]}" prefix of sequence "{sequence}"). ' \
-                'If this is acceptable, set "allow_partial_sequences=True" and try calling build routine again'
-            )    
-        sequence_procrustean = repeat_string_to_length(sequence, target_length=n_mono_middle, joiner='')
-        n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit)
-    else: # for a purely-whole number of block sequence repeats
-        if n_seq_whole < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced my hand
-            raise InsufficientChainLength(
-                f'{n_monomers_target}-monomer chain cannot accomodate both {n_monomers_terminal} end groups AND at least 1 middle monomer sequence'
+        if self.n_repeat_units_middle < 0:
+            raise EndGroupDominatedChain(
+                f'Number of terminal monomers exceeds requested chain length; ({self.n_repeat_units}-mer ' \
+                f'chain can\'t possibly contain {self.n_repeat_units_terminal} terminal monomers)'
             )
-        sequence_procrustean = sequence # NOTE: rename here is for clarity, and for consistency with partial sequence case
-        n_seq_repeats = n_seq_whole
+            
+    def copy(self) -> 'LinearCopolymerSequencer':
+        '''Returns another equivalent instance of the current sequence info more efficiently than a complete deepcopy'''
+        return self.__class__(**asdict(self))
+            
+    def reduce(self) -> None:
+        '''
+        Determines if there is a shorter repeating subsequence making up the current sequence kernel
+        If there is, adjusts the sequence kernel to that minimal sequence; does nothing otherwise
         
-    # Generate descriptive log message to summarize sequence modifications
-    ## Determine info present for whole and partial sections
-    desc_seq_counts_parts = []
-    desc_seq_order_middle = []
+        Reduction is idempotent, and guarantees that the smallest possible kernel is used when sequencing
+        '''
+        minimal_subsequence = shortest_repeating_substring(self.sequence_kernel)
+        kernel_period = self.block_size // len(minimal_subsequence) # account for any periodic shortening WITHIN the kernel
+        
+        if kernel_period == 1:
+            LOGGER.info(f'Sequence kernel "{self.sequence_kernel}" is already fully reduced; no changes made')
+            return
+        else:
+            LOGGER.info(
+                f'Sequence kernel "{self.sequence_kernel}" can be further decomposed as {kernel_period}*"{minimal_subsequence}"; ' \
+                f'Setting kernel to minimal subsequence "{minimal_subsequence}"'
+            )
+            self.sequence_kernel = minimal_subsequence
     
-    if n_seq_whole != 0: ## Whole sequence strings
-        desc_seq_counts_parts.append(f'{n_seq_whole} whole {block_size}-sequence repeats')
-        desc_seq_order_middle.append(f'{n_seq_whole}*[{sequence}]')
+    def reduced(self) -> 'LinearCopolymerSequencer':
+        '''Return a sequence-reduced version of the current sequence info'''
+        clone = self.copy()
+        clone.reduce()
         
-    if n_symbols_remaining != 0: ## Partial sequence strings
-        desc_seq_counts_parts.append(f'a partial {n_symbols_remaining}/{block_size} sequence repeat')
-        desc_seq_order_middle.append(f'[{sequence[:n_symbols_remaining]}]')
+        return clone
         
-    ## Finalizing sequence counts descriptor parts
-    tally_str = f'({n_seq_whole}*{block_size} + {n_symbols_remaining}) middle monomers + {n_monomers_terminal} terminal monomers = {n_monomers_target} total monomers)'
-    if len(desc_seq_counts_parts) == 2:
-        desc_seq_counts_parts.insert(1, ' and ') # include conjunction if a mixed (i.e. both whole and fractional) solution was found
+    # Properties derived from sequence kernel and target chain lengths
+   
+    @property
+    def n_repeat_units_middle(self) -> int:
+        '''Number of middle (i.e. non-terminal) repeat units'''
+        return self.n_repeat_units - self.n_repeat_units_terminal
+
+    # Whole sequence periods
+    @property
+    def block_size(self) -> int:
+        '''Number of repeat units in one whole iteration of the kernel block'''
+        return len(self.sequence_kernel)
+    period = block_size
+    
+    @property
+    def n_full_periods(self) -> int:
+        '''
+        Largest number of complete repetitions of the sequence kernel which, when taken
+        together, contain no more repeat units than the specified number of middle units
+        '''
+        return self.n_repeat_units_middle // self.block_size
+    
+    # Partial sequence residues
+    @property
+    def n_residual_repeat_units(self) -> int:
+        '''
+        Difference between number of middle repeat units and units which
+        would occur in maximal full periods of the kernel
+        
+        By construction, is strictly less than the block size and is identically zero
+        exactly when the middle units comprise a whole number of kernel repeats
+        '''
+        return self.n_repeat_units_middle % self.block_size
+    n_residual_symbols = n_res = n_residual_repeat_units
+    
+    @property
+    def has_residual(self) -> bool:
+        '''
+        Whether a partial kernel repeat is needed, i.e. True exactly when the target number
+        of middle repeat units can NOT be attained by a whole number of kernel repeats
+        '''
+        return bool(self.n_residual_repeat_units)
     
-    ## Finalizing sequence order descriptor parts
-    desc_seq_order_parts = ['[END-GROUP]']*n_monomers_terminal # abut with correct amount of end group indicators
-    desc_seq_order_parts[1:-1] = desc_seq_order_middle # insert middle sections for whole and partial sequences
+    @property
+    def sequence_residual(self) -> str:
+        '''Partial repeat of the kernel sequence needed to attain the specified number of middle units'''
+        return self.sequence_kernel[:self.n_residual_repeat_units]
+    residual = sequence_residual
     
-    ## putting everything together
-    LOGGER.info(f'Target chain length achievable with {"".join(desc_seq_counts_parts)};\n Namely, polymer will be sequenced as {" + ".join(desc_seq_order_parts)}, yielding {tally_str}')
+    ## PROCRUSTEAN sequence alignment
+    def procrustean_alignment(self, allow_partial_sequences : bool=False) -> tuple[str, int]:
+        '''
+        PROCRUSTEAN: Periodic Repetition Of Cyclic Repeat Unit Sequences, Truncated to an Exact and Arbitrary Number
+        Stretches or truncates the sequence kernel to achieve a target sequence length
         
-    return sequence_procrustean, n_seq_repeats
+        Algorithm produces a sequence string "P" and number of repeats "r" which, taken together, satisfy the following:
+        - The number of units in r repeats of P plus the number of terminal monomers is precisely equal to the target number of monomers
+        - The units in P cycle through the units in S, in the order they appear in S
+        - The number of times S is cycled through in P is always a rational multiple of the length of S
+        If no satisfiable sequence-count pair can be found, raises an appropriate informative exception
+        '''
+        if not self.has_residual: # the case where the target length happens to consist of a whole-number of repeats of the kernel
+            if self.n_full_periods < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced my hand
+                raise InsufficientChainLength(
+                    f'{self.n_repeat_units}-monomer chain cannot accomodate both {self.n_repeat_units_terminal} end groups AND at least 1 middle monomer sequence'
+                )
+            sequence_procrustean = self.sequence_kernel
+            n_seq_repeats = self.n_full_periods
+        else:
+            if not allow_partial_sequences:
+                raise PartialBlockSequence(
+                    f'Partial polymer block sequence required to meet target number of monomers ("{self.residual}" prefix of sequence "{self.sequence_kernel}");\n' \
+                    'If this is acceptable, set "allow_partial_sequences=True" and try calling build routine again'
+                )    
+            sequence_procrustean = repeat_string_to_length(self.sequence_kernel, target_length=self.n_repeat_units_middle, joiner='')
+            n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit)
+            
+        return sequence_procrustean, n_seq_repeats
+    
+    def describe_order(self, end_group_names : Optional[Iterable[str]]=None, default_end_group_name : str='END-GROUP') -> str:
+        '''Descriptive string presenting a condensed view of the order of repeat units in the final sequence'''
+        # Assign names for end groups
+        if end_group_names is None:
+            end_group_names = [f'[{default_end_group_name}]']*self.n_repeat_units_terminal 
+        else:
+            end_group_names = [f'[{end_group_name}]' for end_group_name in end_group_names] # unpack into list and enforce correct number of names
+        if (num_names_provided := len(end_group_names)) != self.n_repeat_units_terminal: # DEV: consider supporting filling in missing names with default in future
+            raise IndexError(f'Defined sequence info with {self.n_repeat_units_terminal} end groups, but only provided names for {num_names_provided}')
+        
+        # Insert middle monomer parts as necessary
+        sequence_middle = []
+        if self.n_full_periods != 0: ## Whole sequence strings
+            sequence_middle.append(f'{self.n_full_periods}*[{self.sequence_kernel}]')
+        if self.has_residual: ## Partial sequence strings
+            sequence_middle.append(f'[{self.residual}]')
+            
+        # Abut with correct amount of end group indicators
+        sequence_parts = end_group_names[:] 
+        sequence_parts[1:-1] = sequence_middle
+        
+        return ' + '.join(sequence_parts)
+    
+    def describe_tally(self) -> str:
+        '''Descriptive string indicating how all parts of the overall sequence contribute to the target number of repeat units'''
+        desc_seq_counts_parts = []
+        if self.n_full_periods != 0: ## Whole sequence strings
+            desc_seq_counts_parts.append(f'{self.n_full_periods} whole {self.block_size}-sequence repeat(s)')
+        if self.has_residual: ## Partial sequence strings
+            desc_seq_counts_parts.append(f'a partial {self.n_residual_repeat_units}/{self.block_size} sequence repeat')
+            
+        return ' and '.join(desc_seq_counts_parts)
+        
\ No newline at end of file

From 742522e4e602095e8ae2f562fecc7804cd0ae9cf Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 14:49:26 -0700
Subject: [PATCH 42/78] Made LinearCopolymerSequencer serializable to/from JSON

---
 polymerist/polymers/building/sequencing.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/polymerist/polymers/building/sequencing.py b/polymerist/polymers/building/sequencing.py
index 8693118..60b48ab 100644
--- a/polymerist/polymers/building/sequencing.py
+++ b/polymerist/polymers/building/sequencing.py
@@ -9,10 +9,12 @@
 from typing import Iterable, Optional
 from dataclasses import dataclass, field, asdict
 
-from polymerist.polymers.exceptions import EndGroupDominatedChain, InsufficientChainLength, EmptyBlockSequence, PartialBlockSequence
-from polymerist.genutils.textual.substrings import shortest_repeating_substring, repeat_string_to_length
+from ...genutils.textual.substrings import shortest_repeating_substring, repeat_string_to_length
+from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable
+from ..exceptions import EndGroupDominatedChain, InsufficientChainLength, EmptyBlockSequence, PartialBlockSequence
 
 
+@make_jsonifiable
 @dataclass
 class LinearCopolymerSequencer:
     '''

From 013e09bdc46924af2289b8978c85f56d8d3af38a Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 14:49:43 -0700
Subject: [PATCH 43/78] Added RDKit-driven PDB writer for mbuild Compounds

---
 polymerist/polymers/building/__init__.py  |  5 +-
 polymerist/polymers/building/mbconvert.py | 77 ++++++++++++++---------
 2 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/polymerist/polymers/building/__init__.py b/polymerist/polymers/building/__init__.py
index 8fdd0b7..dfe3e3d 100644
--- a/polymerist/polymers/building/__init__.py
+++ b/polymerist/polymers/building/__init__.py
@@ -15,4 +15,7 @@
     )
     
 from .linear import build_linear_polymer
-from .mbconvert import mbmol_to_openmm_pdb, mbmol_from_mono_rdmol, mbmol_to_rdmol
\ No newline at end of file
+from .mbconvert import (
+    mbmol_from_mono_rdmol, mbmol_to_rdmol,
+    mbmol_to_openmm_pdb, mbmol_to_rdkit_pdb,
+)
\ No newline at end of file
diff --git a/polymerist/polymers/building/mbconvert.py b/polymerist/polymers/building/mbconvert.py
index b7ae69f..ebef303 100644
--- a/polymerist/polymers/building/mbconvert.py
+++ b/polymerist/polymers/building/mbconvert.py
@@ -31,13 +31,13 @@
     from mbuild.conversion import from_rdkit
     
 from ..monomers.specification import SANITIZE_AS_KEKULE
-from ...genutils.decorators.functional import allow_string_paths
+from ...genutils.decorators.functional import allow_string_paths, allow_pathlib_paths
 from ...rdutils.bonding.portlib import get_linker_ids
 from ...rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports
 from ...mdtools.openmmtools.serialization import serialize_openmm_pdb
 
 
-
+# Conversion from other formats to Compound
 def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tuple[Compound, list[int]]:
     '''
     Accepts a monomer-spec-compliant SMARTS string and returns an mbuild Compound and a list of the indices of atom ports
@@ -55,32 +55,9 @@ def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tup
         mb_compound.name = resname
 
     return mb_compound, linker_ids
-
-@allow_string_paths
-def mbmol_to_openmm_pdb(
-        pdb_path : Path,
-        mbmol : Compound, 
-        num_atom_digits : int=2,
-        resname_map : Optional[dict[str, str]]=None,
-    ) -> None:
-    '''Save an MBuild Compound into an OpenMM-compatible PDB file'''
-    if resname_map is None: # avoid mutable default
-        resname_map = {'RES' : 'Pol'} 
-
-    traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more infor-rich format)
-    omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory
-
-    serialize_openmm_pdb(
-        pdb_path,
-        topology=omm_top,
-        positions=omm_pos,
-        uniquify_atom_ids=True,
-        num_atom_id_digits=num_atom_digits,
-        resname_map=resname_map
-    )
-    
-# TODO: deduplify PDB atom anme and residue numbering code against serialize_openmm_pdb()
-def mbmol_to_rdmol(
+   
+# Conversion from Compound to other formats
+def mbmol_to_rdmol( # TODO: deduplify PDB atom name and residue numbering code against serialize_openmm_pdb()
         mbmol : Compound,
         uniquify_atom_ids : bool=False,
         num_atom_id_digits : int=2,
@@ -123,4 +100,46 @@ def mbmol_to_rdmol(
             atom_id += 1 # TODO: this is an awful waay of keeping track of atom indices, see if there's a more secure way to do this
     conf_id = rdmol.AddConformer(conformer)
     
-    return rdmol
\ No newline at end of file
+    return rdmol
+
+# Serialization of Compounds to files
+@allow_string_paths
+def mbmol_to_openmm_pdb(
+        pdb_path : Path,
+        mbmol : Compound, 
+        num_atom_digits : int=2,
+        resname_map : Optional[dict[str, str]]=None,
+    ) -> None:
+    '''Save an MBuild Compound into an OpenMM-formatted PDB file'''
+    if resname_map is None: # avoid mutable default
+        resname_map = {'RES' : 'Pol'} 
+
+    traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more infor-rich format)
+    omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory
+
+    serialize_openmm_pdb(
+        pdb_path,
+        topology=omm_top,
+        positions=omm_pos,
+        uniquify_atom_ids=True,
+        num_atom_id_digits=num_atom_digits,
+        resname_map=resname_map
+    )
+
+@allow_pathlib_paths
+def mbmol_to_rdkit_pdb(
+        pdb_path : str,
+        mbmol : Compound, 
+        num_atom_digits : int=2,
+        resname_map : Optional[dict[str, str]]=None,
+    ) -> None:
+    '''Save an MBuild Compound into an RDKit-formatted PDB file'''
+    Chem.MolToPDBFile(
+        mbmol_to_rdmol(
+            mbmol,
+            uniquify_atom_ids=True,
+            num_atom_id_digits=num_atom_digits,
+            resname_map=resname_map
+        ),
+        pdb_path,
+    )
\ No newline at end of file

From fb3d898f2933eeacf641ef5de6a97d413dce2ae4 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 14:56:54 -0700
Subject: [PATCH 44/78] Expanded out unit test modules for .polymers.building

---
 polymerist/tests/polymers/building/__init__.py            | 4 ++++
 .../{test_building.py => building/test_linear.py}         | 6 ++----
 polymerist/tests/polymers/building/test_sequencing.py     | 8 ++++++++
 3 files changed, 14 insertions(+), 4 deletions(-)
 create mode 100644 polymerist/tests/polymers/building/__init__.py
 rename polymerist/tests/polymers/{test_building.py => building/test_linear.py} (65%)
 create mode 100644 polymerist/tests/polymers/building/test_sequencing.py

diff --git a/polymerist/tests/polymers/building/__init__.py b/polymerist/tests/polymers/building/__init__.py
new file mode 100644
index 0000000..f4f43b4
--- /dev/null
+++ b/polymerist/tests/polymers/building/__init__.py
@@ -0,0 +1,4 @@
+'''Unit tests for `building` package'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
\ No newline at end of file
diff --git a/polymerist/tests/polymers/test_building.py b/polymerist/tests/polymers/building/test_linear.py
similarity index 65%
rename from polymerist/tests/polymers/test_building.py
rename to polymerist/tests/polymers/building/test_linear.py
index 799123f..f9f4721 100644
--- a/polymerist/tests/polymers/test_building.py
+++ b/polymerist/tests/polymers/building/test_linear.py
@@ -1,4 +1,4 @@
-'''Unit tests for `attrs` package'''
+'''Tests construction of structures for linear copolymers (and relevant subfamilies, e.g. homopolymers)'''
 
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
@@ -13,6 +13,4 @@
 
 @pytest.fixture
 def fragments_path() -> Path:
-    return get_file_path_within_package('peg=pla-pga.json', testdata)
-
-# Also add separate tests module for polymers.estimation
+    return get_file_path_within_package('peg-pla-pga.json', testdata)
\ No newline at end of file
diff --git a/polymerist/tests/polymers/building/test_sequencing.py b/polymerist/tests/polymers/building/test_sequencing.py
new file mode 100644
index 0000000..2080dc7
--- /dev/null
+++ b/polymerist/tests/polymers/building/test_sequencing.py
@@ -0,0 +1,8 @@
+'''Testing that copolymer sequencing scales (and fails) as expected'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+import pytest
+from pathlib import Path
+

From 677fedeb996f397219b069faf623dfb40c1c9e6e Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 15:42:09 -0700
Subject: [PATCH 45/78] Updated description of the "PROCRUSTEAN" acronym

---
 polymerist/polymers/building/sequencing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/polymerist/polymers/building/sequencing.py b/polymerist/polymers/building/sequencing.py
index 60b48ab..cd52c69 100644
--- a/polymerist/polymers/building/sequencing.py
+++ b/polymerist/polymers/building/sequencing.py
@@ -137,8 +137,8 @@ def sequence_residual(self) -> str:
     ## PROCRUSTEAN sequence alignment
     def procrustean_alignment(self, allow_partial_sequences : bool=False) -> tuple[str, int]:
         '''
-        PROCRUSTEAN: Periodic Repetition Of Cyclic Repeat Unit Sequences, Truncated to an Exact and Arbitrary Number
-        Stretches or truncates the sequence kernel to achieve a target sequence length
+        PROCRUSTEAN: Periodic Recurrence Of Cyclic Repeat Unit Sequences, Truncated to an Exact and Arbitrary Number
+        Stretches or truncates the sequence kernel to achieve a target sequence length, cycling through the kernel's period as many times as needed
         
         Algorithm produces a sequence string "P" and number of repeats "r" which, taken together, satisfy the following:
         - The number of units in r repeats of P plus the number of terminal monomers is precisely equal to the target number of monomers

From 870a1eb560f892960e9020a38ded24edc759aaff Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 15:42:19 -0700
Subject: [PATCH 46/78] Wrote unit tests for copolymer sequencing

---
 .../polymers/building/test_sequencing.py      | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/polymerist/tests/polymers/building/test_sequencing.py b/polymerist/tests/polymers/building/test_sequencing.py
index 2080dc7..272fd4d 100644
--- a/polymerist/tests/polymers/building/test_sequencing.py
+++ b/polymerist/tests/polymers/building/test_sequencing.py
@@ -3,6 +3,121 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
+from typing import Any
+from dataclasses import asdict
+
 import pytest
 from pathlib import Path
 
+from polymerist.polymers.building.sequencing import LinearCopolymerSequencer as LCS
+from polymerist.polymers.exceptions import EmptyBlockSequence, PartialBlockSequence, InsufficientChainLength, EndGroupDominatedChain
+
+
+@pytest.fixture
+def sequencer() -> LCS:
+    '''A sample sequencer with known, valid inputs'''
+    return LCS(sequence_kernel='ABAB', n_repeat_units=14, n_repeat_units_terminal=2)
+
+@pytest.mark.parametrize(
+    'inputs',
+    [
+        {
+            'sequence_kernel' : 'AB',
+            'n_repeat_units' : 10,
+            'n_repeat_units_terminal' : 1
+        },
+        pytest.param(
+            {
+                'sequence_kernel' : 'BAC',
+                'n_repeat_units' : 1,
+                'n_repeat_units_terminal' : 2
+            },
+            marks=pytest.mark.xfail(
+                raises=EndGroupDominatedChain,
+                reason='Results in (unsatisfiable) negative number of middle monomers',
+                strict=True,
+            )
+        ),
+        pytest.param(
+            {
+                'sequence_kernel' : '',
+                'n_repeat_units' : 7,
+                'n_repeat_units_terminal' : 1
+            },
+            marks=pytest.mark.xfail(
+                raises=EmptyBlockSequence,
+                reason='No sequence kernel provided',
+                strict=True,
+            )
+        ), 
+    ]
+)
+def test_LCS_input_validation(inputs : dict[str, Any]) -> None:
+    '''Test that invalid sequencer inputs are correctly rejected'''
+    _ = LCS(**inputs) # no assert needed, just checking when initialization completes
+    
+def test_LCS_copying(sequencer : LCS) -> None:
+    '''Test that sequencers are properly copied in a read-only manner'''
+    sequencer_clone = sequencer.copy()
+    
+    # tamper with the parameters of the copy in a way that guarantees distinctness
+    sequencer_clone.sequence_kernel = 2*sequencer.sequence_kernel
+    sequencer_clone.n_repeat_units += 2
+    sequencer_clone.n_repeat_units_terminal += 1
+    
+    # check that the original WASN'T tampered with
+    assert asdict(sequencer) != asdict(sequencer_clone)
+    
+    
+@pytest.mark.parametrize(
+    'sequencer, expected_kernel',
+    [
+        (LCS('ABC', n_repeat_units=12), 'ABC') , # test irreducible case
+        (LCS('ABAB', n_repeat_units=12), 'AB'), # test unreduced case
+    ]
+)
+def test_LCS_reduction(sequencer : LCS, expected_kernel : str) -> None:
+    '''Test that shortest repeating subsequences of sequencer kernels are correctly identified'''
+    sequencer.reduce()
+    assert sequencer.sequence_kernel == expected_kernel
+    
+@pytest.mark.parametrize(
+    'sequencer, allow_partials, expected_sequence, expected_length',
+    [
+        # tests for homopolymers
+        (LCS('A', 5, 1), True , 'A', 4),
+        (LCS('A', 5, 1), False, 'A', 4), # partial block single-monomer sequence will never exist, so "allow_partial_sequences" setting shouldn't matter
+        pytest.param(
+            LCS('A', 1, 1), True, 'A', 1, # test that all-end group (i.e. no middle monomer) case is correctly rejected
+            marks=pytest.mark.xfail(
+                raises=InsufficientChainLength,
+                reason='No middle monomers can be accomodated',
+                strict=True,
+            ),
+        ),
+        # tests for "true" copolymers
+        (LCS('ABC', 10, 2), True, 'ABCABCAB', 1),
+        pytest.param(
+            LCS('ABC', 10, 2), False, 'ABCABCAB', 1, # test that partial-sequence ban correctly blocks partial sequences...
+            marks=pytest.mark.xfail(
+                raises=PartialBlockSequence,
+                reason='Partial sequence repeats have not been allowed',
+                strict=True,
+            ),
+        ),
+        (LCS('ABC', 11, 2), False, 'ABC', 3), # ...unless the resulting sequence happens to be a whole multiple
+        pytest.param(
+            LCS('ABC', 2, 2), True, '', 1, # test that all-end group (i.e. no middle monomer) case is correctly rejected...
+            marks=pytest.mark.xfail(
+                raises=InsufficientChainLength,
+                reason='No middle monomers can be accomodated',
+                strict=True,
+            ),
+        ),
+        (LCS('ABC', 4, 2), True, 'AB', 1), # ... and finally, check that nonempty sequences SMALLER than the kernel are also recognized if partials are permitted
+    ]
+)
+def test_LCS_procrustean_alignment(sequencer : LCS, allow_partials : bool, expected_sequence : str, expected_length : int) -> None:
+    '''Test capability (and prechecks) for fitting sequence to target chain length'''
+    seq, n_reps = sequencer.procrustean_alignment(allow_partial_sequences=allow_partials)
+    assert (seq == expected_sequence) and (n_reps == expected_length)
\ No newline at end of file

From ce9c55c98c69e506f51a5afcbe93073bd11532eb Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 15:56:57 -0700
Subject: [PATCH 47/78] Updates SMILES/SMARTS-related type annotations on
 validation functions

---
 polymerist/smileslib/primitives.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/polymerist/smileslib/primitives.py b/polymerist/smileslib/primitives.py
index 844cad9..91c87a6 100644
--- a/polymerist/smileslib/primitives.py
+++ b/polymerist/smileslib/primitives.py
@@ -13,11 +13,11 @@
 Smiles : TypeAlias = str # purely for improving self-documentation of functions, no benefit to static type-checkers
 Smarts : TypeAlias = str # purely for improving self-documentation of functions, no benefit to static type-checkers
 
-def is_valid_SMARTS(smarts : str) -> bool:
+def is_valid_SMARTS(smarts : Smarts) -> bool:
     '''Check if SMARTS string is valid (according to RDKit)'''
     return (Chem.MolFromSmarts(smarts) is not None)
 
-def is_valid_SMILES(smiles : str) -> bool:
+def is_valid_SMILES(smiles : Smiles) -> bool:
     '''Check if SMARTS string is valid (according to RDKit)'''
     return (Chem.MolFromSmiles(smiles) is not None)
 

From 75fcceebb0322734d51b20c4d411378a93ff6343 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 16:05:16 -0700
Subject: [PATCH 48/78] Moved fragment data directly into code, as opposed to
 maintaining separate data file

---
 polymerist/tests/data/peg-pla-pga.json | 38 --------------------------
 polymerist/tests/polymers/__init__.py  | 12 ++++++++
 2 files changed, 12 insertions(+), 38 deletions(-)
 delete mode 100644 polymerist/tests/data/peg-pla-pga.json

diff --git a/polymerist/tests/data/peg-pla-pga.json b/polymerist/tests/data/peg-pla-pga.json
deleted file mode 100644
index a865ed5..0000000
--- a/polymerist/tests/data/peg-pla-pga.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-    "__class__": "MonomerGroup",
-    "__values__": {
-        "monomers": {
-            "PEG-1A": [
-                "[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7])-[#1D1+0:5]"
-            ],
-            "PEG-1B": [
-                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#8D2+0:5]-[#1D1+0:10])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]"
-            ],
-            "PEG-2": [
-                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[*:5])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]"
-            ],
-            "PLA-1A": [
-                "[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:4](=[#8D1+0:5])-[*:6])-[#1D1+0:8])-[#1D1+0:7]"
-            ],
-            "PLA-1B": [
-                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[#8D2+0:7]-[#1D1+0:12])-[#1D1+0:8]"
-            ],
-            "PLA-2": [
-                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[*:7])-[#1D1+0:8]"
-            ],
-            "PGA-1A": [
-                "[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]"
-            ],
-            "PGA-1B": [
-                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]"
-            ],
-            "PGA-2": [
-                "[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]"
-            ]
-        },
-        "term_orient": {
-            "head": "PEG-1A",
-            "tail": "PEG_1B"
-        }
-    }
-}
\ No newline at end of file
diff --git a/polymerist/tests/polymers/__init__.py b/polymerist/tests/polymers/__init__.py
index f37a37e..3adfea5 100644
--- a/polymerist/tests/polymers/__init__.py
+++ b/polymerist/tests/polymers/__init__.py
@@ -2,3 +2,15 @@
 
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
+
+PEG_PLGA_FRAGMENTS : dict[str, list[str]] = { # fragments for all variants of PEG-PLGA-like polymers for testing 
+    'PEG-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7])-[#1D1+0:5]'],
+    'PEG-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#8D2+0:5]-[#1D1+0:10])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'],
+    'PEG-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[*:5])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'],
+    'PLA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:4](=[#8D1+0:5])-[*:6])-[#1D1+0:8])-[#1D1+0:7]'],
+    'PLA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[#8D2+0:7]-[#1D1+0:12])-[#1D1+0:8]'],
+    'PLA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[*:7])-[#1D1+0:8]'],
+    'PGA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]'],
+    'PGA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]'],
+    'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]']
+}

From 59f988aad938f262d162942192e39cd4b5a3cc0b Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 16:05:34 -0700
Subject: [PATCH 49/78] Removed superfluous mBuild imports

---
 polymerist/polymers/building/linear.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/polymerist/polymers/building/linear.py b/polymerist/polymers/building/linear.py
index 62e90a6..b1a5fd4 100644
--- a/polymerist/polymers/building/linear.py
+++ b/polymerist/polymers/building/linear.py
@@ -9,8 +9,6 @@
 import warnings
 with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings
     warnings.filterwarnings('ignore',  category=DeprecationWarning)
-    import mbuild as mb
-    from mbuild import Compound
     from mbuild.lib.recipes.polymer import Polymer as MBPolymer
 
 from .mbconvert import mbmol_from_mono_rdmol

From 29d3198bfe71ed7edf706d547d01b288841cd6d9 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 16:11:57 -0700
Subject: [PATCH 50/78] Added devnote to revisit SMARTS-specification
 auto-cleaning

---
 polymerist/polymers/monomers/repr.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index 174bd80..6003228 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -46,7 +46,8 @@ def __post_init__(self) -> None:
             # check that all SMARTS are valid
             for i, smarts in enumerate(smarts_list): # we can now be sure that this is a list of SMARTS strings
                 if not is_valid_SMARTS(smarts):
-                    raise ValueError(f'Provided invalid monomer SMARTS string for {resname}[{i}]: "{smarts}"')               
+                    raise ValueError(f'Provided invalid monomer SMARTS string for {resname}[{i}]: "{smarts}"')      
+                # DEV: decide whether or not SMILES expansion and spec-compliance should be enforced here or shunted off to the user 
         # DEV: opted to forgo term_orient check for now, as modifying this violates the read-only data model aimed for here
                 
     @staticmethod

From 7afceed37608e8e8a8c74b37f02e15acd2357975 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 16:23:46 -0700
Subject: [PATCH 51/78] Added devnote for spec compliance checker

---
 polymerist/polymers/monomers/specification.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/polymerist/polymers/monomers/specification.py b/polymerist/polymers/monomers/specification.py
index 0731105..8561dd4 100644
--- a/polymerist/polymers/monomers/specification.py
+++ b/polymerist/polymers/monomers/specification.py
@@ -98,6 +98,7 @@ def compliant_atom_query_from_re_match(match : re.Match) -> str:
 
 
 # CONVERSION METHODS
+## DEV: add function to check whether a given SMARTS is COMPLETELY spec-compliant
 def compliant_mol_SMARTS(smarts : str) -> str:
     '''Convert generic SMARTS string into a spec-compliant one'''
     # initial checks

From d6d5880a6037c0f1fdde7f58019280af62037060 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 16:52:45 -0700
Subject: [PATCH 52/78] Added MPD-TMC polyamide fragments for examples

---
 polymerist/tests/polymers/__init__.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/polymerist/tests/polymers/__init__.py b/polymerist/tests/polymers/__init__.py
index 3adfea5..a5126d8 100644
--- a/polymerist/tests/polymers/__init__.py
+++ b/polymerist/tests/polymers/__init__.py
@@ -3,14 +3,27 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
-PEG_PLGA_FRAGMENTS : dict[str, list[str]] = { # fragments for all variants of PEG-PLGA-like polymers for testing 
+PEG_PLGA_FRAGMENTS : dict[str, list[str]] = { # fragments for all variants of PEG-PLGA-like polymers
+    # PEG (ethylene glycol)
     'PEG-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7])-[#1D1+0:5]'],
     'PEG-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#8D2+0:5]-[#1D1+0:10])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'],
     'PEG-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[*:5])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'],
+    # PLA (lactic acid)
     'PLA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:4](=[#8D1+0:5])-[*:6])-[#1D1+0:8])-[#1D1+0:7]'],
     'PLA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[#8D2+0:7]-[#1D1+0:12])-[#1D1+0:8]'],
     'PLA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[*:7])-[#1D1+0:8]'],
+    # PGA (glycolic acid)
     'PGA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]'],
     'PGA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]'],
-    'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]']
+    'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'],
+}
+
+MPD_TMC_FRAGMENTS : dict[str, list[str]] = { # fragments for common polyamide membrane
+    # MPD (m-phenyl diamine)
+    'MPD-1': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:11])-[#6D3+0:5](-[#1D1+0:12])=[#6D3+0:6](-[#1D1+0:13])-[#6D3+0:7](-[#1D1+0:14])=[#6D3+0:8]-1-[#7D3+0:9](-[#1D1+0:15])-[#1D1+0:16])-[#1D1+0:10]'],
+    'MPD-2': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:12])-[#6D3+0:5](-[#1D1+0:13])=[#6D3+0:6](-[#1D1+0:14])-[#6D3+0:7](-[#1D1+0:15])=[#6D3+0:8]-1-[#7D3+0:9](-[*:10])-[#1D1+0:16])-[#1D1+0:11]'],
+    # TMC (trimesoyl chloride)
+    'TMC-1': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[#17D1+0:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'],
+    'TMC-2': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
+    'TMC-3': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
 }

From 0e7749faba1916f5c82b92726ae24db26c51ce42 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 17:24:39 -0700
Subject: [PATCH 53/78] Added unit tests for MonomerGroup initialization and
 core properties

---
 .../tests/polymers/monomers/test_repr.py      | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 polymerist/tests/polymers/monomers/test_repr.py

diff --git a/polymerist/tests/polymers/monomers/test_repr.py b/polymerist/tests/polymers/monomers/test_repr.py
new file mode 100644
index 0000000..3c90e43
--- /dev/null
+++ b/polymerist/tests/polymers/monomers/test_repr.py
@@ -0,0 +1,147 @@
+'''Tests that collections of monomer fragments are treated as expected'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+from typing import Any
+
+import pytest
+
+from ..import PEG_PLGA_FRAGMENTS, MPD_TMC_FRAGMENTS
+from polymerist.polymers.monomers.repr import MonomerGroup
+
+
+@pytest.fixture(scope='function') # want to re-initialize for each test function to avoid cross-contamination
+def monogrp_peg_plga() ->  MonomerGroup:
+    return MonomerGroup(monomers=PEG_PLGA_FRAGMENTS)
+
+@pytest.fixture(scope='function') # want to re-initialize for each test function to avoid cross-contamination
+def monogrp_mpd_tmc() ->  MonomerGroup:
+    return MonomerGroup(monomers=MPD_TMC_FRAGMENTS)
+
+# Testing all routes to initialization
+@pytest.mark.parametrize(
+    'monomers',
+    [
+        { # nominal test case
+            'PGA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]'],
+            'PGA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]'],
+            'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'],
+        },
+        { # test that list closure autofill works
+            'PGA-1A': '[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]',
+            'PGA-1B': '[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]',
+            'PGA-2': '[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]',
+        },
+        # XFAILS: test that the initializer rejects...
+        pytest.param(
+            { # ...1) non-string like objects
+                'foo' : 42.0,
+                'bar' : True,
+            },
+            marks=pytest.mark.xfail(
+                raises=TypeError,
+                reason='Monomer fragment inputs are not stringlike',
+                strict=True,
+            ),
+        ),
+        pytest.param(
+            { #  1a) more subtly, list OF CONTAINERS of valid SMARTS are still invalid
+                'PGA-1A': [( 
+                    '[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]',
+                    '[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]'
+                )],
+                'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'],
+            },
+            marks=pytest.mark.xfail(
+                raises=TypeError,
+                reason='Monomer fragment inputs are not stringlike',
+                strict=True,
+            ),
+        ),
+        pytest.param(
+            { # ...2) empty lists
+                'PGA-1A': [],
+                'PGA-2' : [],
+            },
+            marks=pytest.mark.xfail(
+                raises=IndexError,
+                reason='At least one monomer fragment input is empty',
+                strict=True,
+            ),
+        ),
+        pytest.param(
+            { # ...3) non-empty strings which are nevertheless invalid SMARTS 
+             #- NOTE: empty strings, perhaps surprisingly, actually ARE valid as SMARTS and therefore aren't xfail tested here
+                'fake-1': ['this is a bogus SMARTS'],
+                'invalid-2': ['so_is_this'],
+            },
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+                reason='At least one monomer fragment input is not valid a SMARTS string',
+                strict=True,
+            ),
+        ),
+        pytest.param(
+            { # ...3a) this one is very subtle, but SMARTS with slight errors which invalidate themas SMARTS should also fail
+                'PGA-1A': ['[OH]CD(=O)*'], # fat-finger mistake, "D" should be "C"
+                'PGA-2': ['*OCC(+O)*'], # forgot to hit shift when typing double bond
+            },
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+                reason='At least one monomer fragment input is not valid a SMARTS string',
+                strict=True,
+            ),
+        ),
+    ]
+)
+def test_monogrp_init(monomers : dict[str, Any]) -> None:
+    '''Check that the MonomerGroup initializer handles valid inputs as expected and reject invalid inputs'''
+    _ = MonomerGroup(monomers=monomers) # no assert needed, just checking when initialization completes
+    
+# Testing properties of contained monomers
+@pytest.mark.parametrize(
+    'monogrp, expected_is_linear',
+    [
+        ('monogrp_peg_plga', True),
+        ('monogrp_mpd_tmc', False),
+    ],
+)
+def test_monogrp_linearity(monogrp : MonomerGroup, expected_is_linear : bool, request : pytest.FixtureRequest) -> None:
+    '''Test whether branched and unbranched chain fragment detection behaves as expected'''
+    monogrp = request.getfixturevalue(monogrp) # unpack fixtures into their respective values
+    assert monogrp.is_linear == expected_is_linear
+    
+@pytest.mark.parametrize(
+    'monogrp, expected_counts',
+    [
+        ('monogrp_peg_plga', (3, 6)),
+        ('monogrp_mpd_tmc', (3, 2)),
+    ],
+)
+def test_monogrp_mid_and_term_counts(monogrp : MonomerGroup, expected_counts : tuple[int, int], request : pytest.FixtureRequest) -> None:
+    '''Test whether middle and terminal monomers are counted correctly'''
+    monogrp = request.getfixturevalue(monogrp) # unpack fixtures into their respective values
+    assert monogrp.num_mid_and_term == expected_counts
+    
+# Testing end group determination
+@pytest.mark.parametrize(
+    'monogrp, term_orient, expected_end_groups',
+    [
+        ('monogrp_peg_plga', {}, {'head' : 'PEG-1A', 'tail' : 'PEG-1B'}), # test autogeneration from first 2 when
+        ('monogrp_mpd_tmc', (3, 2)),
+    ],
+)
+def test_monogrp_end_groups(monogrp : MonomerGroup, term_orient : dict[str, str], expected_end_groups : dict[str, str], request : pytest.FixtureRequest) -> None:
+    '''Test whether procedural end group determination'''
+    monogrp = request.getfixturevalue(monogrp) # unpack fixtures into their respective values
+    monogrp.term_orient = term_orient
+    
+    end_group_catalogue = monogrp.linear_end_groups()
+    end_group_names = {
+        head_or_tail : resname  # drop RDKit Mol for check (Mol object is harder to validate, use bound name as proxy)
+            for head_or_tail, (resname, _) in end_group_catalogue.items()
+    }
+    
+    assert end_group_names == expected_end_groups
+    
\ No newline at end of file

From e33ca3d0d8b914eb6aa65cded2734cc9e8bad8c5 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 17:28:52 -0700
Subject: [PATCH 54/78] Added polyethylene example to test when fewer than the
 max 2 end group templates are present

---
 polymerist/tests/polymers/__init__.py         | 27 ++++++++++++-------
 .../tests/polymers/monomers/test_repr.py      | 18 ++++++++++---
 2 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/polymerist/tests/polymers/__init__.py b/polymerist/tests/polymers/__init__.py
index a5126d8..d0ddb85 100644
--- a/polymerist/tests/polymers/__init__.py
+++ b/polymerist/tests/polymers/__init__.py
@@ -3,6 +3,23 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
+
+PE_FRAGMENTS : dict[str, list[str]] = {
+    # PE (polyethylene)
+    'PE1': ['[*:1]-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:6])(-[#1D1+0:7])-[#1D1+0:8])(-[#1D1+0:4])-[#1D1+0:5]'],
+    'PE2': ['[*:1]-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:7])-[#1D1+0:8])(-[#1D1+0:5])-[#1D1+0:6]'],
+}
+
+MPD_TMC_FRAGMENTS : dict[str, list[str]] = { # fragments for common polyamide membrane
+    # MPD (m-phenyl diamine)
+    'MPD-1': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:11])-[#6D3+0:5](-[#1D1+0:12])=[#6D3+0:6](-[#1D1+0:13])-[#6D3+0:7](-[#1D1+0:14])=[#6D3+0:8]-1-[#7D3+0:9](-[#1D1+0:15])-[#1D1+0:16])-[#1D1+0:10]'],
+    'MPD-2': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:12])-[#6D3+0:5](-[#1D1+0:13])=[#6D3+0:6](-[#1D1+0:14])-[#6D3+0:7](-[#1D1+0:15])=[#6D3+0:8]-1-[#7D3+0:9](-[*:10])-[#1D1+0:16])-[#1D1+0:11]'],
+    # TMC (trimesoyl chloride)
+    'TMC-1': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[#17D1+0:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'],
+    'TMC-2': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
+    'TMC-3': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
+}
+
 PEG_PLGA_FRAGMENTS : dict[str, list[str]] = { # fragments for all variants of PEG-PLGA-like polymers
     # PEG (ethylene glycol)
     'PEG-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7])-[#1D1+0:5]'],
@@ -17,13 +34,3 @@
     'PGA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]'],
     'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'],
 }
-
-MPD_TMC_FRAGMENTS : dict[str, list[str]] = { # fragments for common polyamide membrane
-    # MPD (m-phenyl diamine)
-    'MPD-1': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:11])-[#6D3+0:5](-[#1D1+0:12])=[#6D3+0:6](-[#1D1+0:13])-[#6D3+0:7](-[#1D1+0:14])=[#6D3+0:8]-1-[#7D3+0:9](-[#1D1+0:15])-[#1D1+0:16])-[#1D1+0:10]'],
-    'MPD-2': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:12])-[#6D3+0:5](-[#1D1+0:13])=[#6D3+0:6](-[#1D1+0:14])-[#6D3+0:7](-[#1D1+0:15])=[#6D3+0:8]-1-[#7D3+0:9](-[*:10])-[#1D1+0:16])-[#1D1+0:11]'],
-    # TMC (trimesoyl chloride)
-    'TMC-1': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[#17D1+0:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'],
-    'TMC-2': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
-    'TMC-3': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
-}
diff --git a/polymerist/tests/polymers/monomers/test_repr.py b/polymerist/tests/polymers/monomers/test_repr.py
index 3c90e43..4b5a9ae 100644
--- a/polymerist/tests/polymers/monomers/test_repr.py
+++ b/polymerist/tests/polymers/monomers/test_repr.py
@@ -7,18 +7,28 @@
 
 import pytest
 
-from ..import PEG_PLGA_FRAGMENTS, MPD_TMC_FRAGMENTS
+from ..import PE_FRAGMENTS, MPD_TMC_FRAGMENTS, PEG_PLGA_FRAGMENTS
 from polymerist.polymers.monomers.repr import MonomerGroup
 
 
+# Example fragments groups
 @pytest.fixture(scope='function') # want to re-initialize for each test function to avoid cross-contamination
-def monogrp_peg_plga() ->  MonomerGroup:
-    return MonomerGroup(monomers=PEG_PLGA_FRAGMENTS)
+def monogrp_degenerate() ->  MonomerGroup:
+    return MonomerGroup(monomers={})
 
-@pytest.fixture(scope='function') # want to re-initialize for each test function to avoid cross-contamination
+@pytest.fixture(scope='function')
+def monogrp_polyethylene() ->  MonomerGroup:
+    return MonomerGroup(monomers=PE_FRAGMENTS)
+
+@pytest.fixture(scope='function')
 def monogrp_mpd_tmc() ->  MonomerGroup:
     return MonomerGroup(monomers=MPD_TMC_FRAGMENTS)
 
+@pytest.fixture(scope='function')
+def monogrp_peg_plga() ->  MonomerGroup:
+    return MonomerGroup(monomers=PEG_PLGA_FRAGMENTS)
+
+
 # Testing all routes to initialization
 @pytest.mark.parametrize(
     'monomers',

From 0074e4a8f2ced05bfd8e9127d34125166c59fff2 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 17:43:03 -0700
Subject: [PATCH 55/78] Wrote unit test for end group identification

---
 .../tests/polymers/monomers/test_repr.py      | 45 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/polymerist/tests/polymers/monomers/test_repr.py b/polymerist/tests/polymers/monomers/test_repr.py
index 4b5a9ae..2433f76 100644
--- a/polymerist/tests/polymers/monomers/test_repr.py
+++ b/polymerist/tests/polymers/monomers/test_repr.py
@@ -138,8 +138,49 @@ def test_monogrp_mid_and_term_counts(monogrp : MonomerGroup, expected_counts : t
 @pytest.mark.parametrize(
     'monogrp, term_orient, expected_end_groups',
     [
-        ('monogrp_peg_plga', {}, {'head' : 'PEG-1A', 'tail' : 'PEG-1B'}), # test autogeneration from first 2 when
-        ('monogrp_mpd_tmc', (3, 2)),
+        # 1) test autogeneration of orientations when...
+        ( # ...term orientation is unspecified but can be completed for both ends (i.e. at least 2 terminal monomers are available)
+            'monogrp_peg_plga',
+            {}, 
+            {'head' : 'PEG-1A', 'tail' : 'PEG-1B'},
+        ), 
+        ( # ...term orientation is unspecified and can only be partially completed (i.e. fewer than 2 terminal monomers are available)
+            'monogrp_polyethylene',
+            {}, 
+            {'head' : 'PE1'},
+        ), 
+        ( # ...term orientation is unspecified but can be completed for both ends (i.e. at least 2 terminal monomers are available)
+            'monogrp_peg_plga',
+            {}, 
+            {'head' : 'PEG-1A', 'tail' : 'PEG-1B'},
+        ), 
+        # 2) test end group identification for correctly-specified term orientation
+        ( # test nominal case
+            'monogrp_peg_plga',
+            {'head' : 'PGA-1A', 'tail' : 'PEG-1B'},
+            {'head' : 'PGA-1A', 'tail' : 'PEG-1B'},
+        ), 
+        ( # test that duplication works as expected
+            'monogrp_polyethylene',
+            {'head' : 'PE1', 'tail' : 'PE1'},
+            {'head' : 'PE1', 'tail' : 'PE1'},
+        ), 
+        # 3) test incorrect specifications
+        ( # specification without "head"/"tail" keys will not fail, but WILL default to auto-gen
+            'monogrp_peg_plga',
+            {'first' : 'PGA-1A', 'second' : 'PEG-1B'},
+            {'head' : 'PEG-1A', 'tail' : 'PEG-1B'},
+        ), 
+        pytest.param( # specification with invalid monomer names (i.e. keys not in the "monomers" dict) should raise outright error
+            'monogrp_peg_plga',
+            {'head' : 'PGG-2C', 'tail' : 'BOGUS'},
+            None,
+            marks=pytest.mark.xfail(
+                raises=KeyError,
+                reason='Term group names specified don;t existing within the monomer fragments defined',
+                strict=True,
+            )
+        ),
     ],
 )
 def test_monogrp_end_groups(monogrp : MonomerGroup, term_orient : dict[str, str], expected_end_groups : dict[str, str], request : pytest.FixtureRequest) -> None:

From f27263fe7e657bba00c13647bf2f1b78955069b6 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 19:56:03 -0700
Subject: [PATCH 56/78] Expanded syntax and support for addition/validation of
 new monomer SMARTS into MonomerGroup

---
 polymerist/polymers/monomers/repr.py          | 71 ++++++++++++++-----
 .../tests/polymers/monomers/test_repr.py      | 13 +---
 2 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index 6003228..3cdc225 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -6,7 +6,7 @@
 import logging
 LOGGER = logging.getLogger(__name__)
 
-from typing import Generator, Optional, Union
+from typing import Generator, Optional, Iterable, Union
 from dataclasses import dataclass, field
 
 from itertools import cycle
@@ -31,36 +31,62 @@ class MonomerGroup:
 
     def __post_init__(self) -> None:
         # Encase bare SMARTS into lists and check that all monomer SMARTS are valid
-        for resname, smarts_seq in self.monomers.items():
-            if isinstance(smarts_seq, list):
-                if not smarts_seq:
-                    raise IndexError(f'Empty monomer declaration for "{resname}"') # catch case where empty list if provided (would slip through subsequent checks otherwise)
-                smarts_list = smarts_seq # no modification needed
-            elif isinstance(smarts_seq, str):
-                LOGGER.warning(f'Wrapping bare monomer SMARTS in list to comply with spec (storing as ["{smarts_seq}"])')
-                smarts_list = [smarts_seq] # wrap lone SMARTS string in list
-                self.monomers[resname] = smarts_list # update value internally (doesn't change size of dict)
-            else:
-                raise TypeError(f'Values of monomers must be either SMARTS strings or lists of SMARTS strings, not "{type(smarts_seq).__name__}"')
-            
-            # check that all SMARTS are valid
-            for i, smarts in enumerate(smarts_list): # we can now be sure that this is a list of SMARTS strings
-                if not is_valid_SMARTS(smarts):
-                    raise ValueError(f'Provided invalid monomer SMARTS string for {resname}[{i}]: "{smarts}"')      
-                # DEV: decide whether or not SMILES expansion and spec-compliance should be enforced here or shunted off to the user 
+        monomers_init = self.monomers # store inputted values
+        self.monomers = {} # clear monomers and re-add one-at-a-time
+        for resname, smarts in monomers_init.items():
+            self.add_monomer(resname, smarts)
         # DEV: opted to forgo term_orient check for now, as modifying this violates the read-only data model aimed for here
                 
+    # ATTRIBUTE PROPERTIES AND ALIASES
     @staticmethod
     def is_terminal(monomer : Chem.Mol) -> bool:
         '''Determine whether or not a monomer is terminal'''
         return get_num_ports(monomer) == 1
+    
+    def _add_monomer(self, resname : str, smarts : Smarts) -> None:
+        '''Add a new monomer to the templates already stored within, subject to validation checks'''
+        if not isinstance(smarts, str): 
+            raise TypeError(f'Values of monomers must be either SMARTS strings or lists of SMARTS strings, not "{type(smarts).__name__}"')
+        # DEV: include check for empty string? (technically still a valid SMARTS string, but a pretty pathological one at that)
+        if not is_valid_SMARTS(smarts):
+            raise ValueError(f'Provided invalid monomer SMARTS string for {resname}: "{smarts}"') 
+        # DEV: decide whether or not SMILES expansion and spec-compliance should be enforced here or shunted off to the user 
+        
+        if resname in self.monomers:
+            existing_resgroup = self.monomers[resname]
+            if isinstance(existing_resgroup, list) and (smarts not in existing_resgroup):
+                LOGGER.info(f'Extending existing residue category "{resname}" with SMARTS {smarts}')
+                self.monomers[resname].append(smarts)
+        else:
+            LOGGER.info(f'Creating new residue category "{resname}", containing singular SMARTS ["{smarts}"])')
+            self.monomers[resname] = [smarts]
+            
+    def _add_monomers(self, resname : str, smarts_container : Iterable[Smarts]) -> None:
+        '''Add new monomers to the templates already stored within, subject to validation checks, from an iterable container'''
+        for smarts in smarts_container:
+            self._add_monomer(resname, smarts)
+    
+    def add_monomer(self, resname : str, smarts : Union[Smarts, Iterable[Smarts]]) -> None:
+        '''Register new monomers, either directly from SMARTS or from a container of SMARTS'''
+        if isinstance(smarts, Iterable) and not isinstance(smarts, str): # don;t want to insert one character at a time if a string is in fact provided
+            self._add_monomers(resname, smarts)
+        else:
+            self._add_monomer(resname, smarts) # assume any other inputs are singular values or strings 
+    
+    def __getitem__(self, resname : str) -> str:
+        '''Convenience method to access .monomers directly from instance'''
+        return self.monomers[resname] # NOTE: deliberately avoid "get()" here to propagate KeyError
 
-    # ATTRIBUTE PROPERTIES AND ALIASES
+    def __setitem__(self, resname : str, smarts : Smarts) -> str:
+        '''Convenience method to access .monomers directly from instance'''
+        self.add_monomer(resname, smarts)
+    
     @property
     def SMARTS(self) -> dict[str, list[Smarts]]:
         '''Alias of legacy "monomers" attribute'''
         return self.monomers # alias of legacy name for convenience
     
+    # ITERATION OVER STORED MOLECULE FRAGMENTS
     def iter_rdmols(self, term_only : Optional[bool]=None) -> Generator[tuple[str, Chem.Mol], None, None]:
         '''
         Generate (residue name, RDKit Mol) pairs of all monomers present
@@ -92,6 +118,13 @@ def rdmols(self, term_only : Optional[bool]=None) -> dict[str, list[Chem.Mol]]:
 
         return rdmol_dict
     
+    def contributions(self, term_only : Optional[bool]=None) -> dict[str, list[int]]:
+        '''Returns dict of the number of real (i.e. non-linker) atoms in each residue list'''
+        return {
+            resname : [mol.GetNumAtoms() - get_num_ports(mol) for mol in mol_list]
+                for resname, mol_list in self.rdmols(term_only=term_only).items()
+        }
+    
     @property
     def n_monomers(self) -> int:
         '''Returns number of present monomers; multiple monomers under the same residue name are considered distinct'''
diff --git a/polymerist/tests/polymers/monomers/test_repr.py b/polymerist/tests/polymers/monomers/test_repr.py
index 2433f76..8c6d462 100644
--- a/polymerist/tests/polymers/monomers/test_repr.py
+++ b/polymerist/tests/polymers/monomers/test_repr.py
@@ -69,17 +69,6 @@ def monogrp_peg_plga() ->  MonomerGroup:
                 strict=True,
             ),
         ),
-        pytest.param(
-            { # ...2) empty lists
-                'PGA-1A': [],
-                'PGA-2' : [],
-            },
-            marks=pytest.mark.xfail(
-                raises=IndexError,
-                reason='At least one monomer fragment input is empty',
-                strict=True,
-            ),
-        ),
         pytest.param(
             { # ...3) non-empty strings which are nevertheless invalid SMARTS 
              #- NOTE: empty strings, perhaps surprisingly, actually ARE valid as SMARTS and therefore aren't xfail tested here
@@ -93,7 +82,7 @@ def monogrp_peg_plga() ->  MonomerGroup:
             ),
         ),
         pytest.param(
-            { # ...3a) this one is very subtle, but SMARTS with slight errors which invalidate themas SMARTS should also fail
+            { # ...3a) this one is very subtle, but SMARTS with slight errors which invalidate them as SMARTS should also fail
                 'PGA-1A': ['[OH]CD(=O)*'], # fat-finger mistake, "D" should be "C"
                 'PGA-2': ['*OCC(+O)*'], # forgot to hit shift when typing double bond
             },

From 10b8b8c3da65a73e61be919be7279218cb66d8bf Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 20:01:36 -0700
Subject: [PATCH 57/78] Added bug note for validation skipping when accessing
 monomer attributes directly

---
 polymerist/polymers/monomers/repr.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index 3cdc225..505ff70 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -76,6 +76,8 @@ def add_monomer(self, resname : str, smarts : Union[Smarts, Iterable[Smarts]]) -
     def __getitem__(self, resname : str) -> str:
         '''Convenience method to access .monomers directly from instance'''
         return self.monomers[resname] # NOTE: deliberately avoid "get()" here to propagate KeyError
+        # BUG: user can directly append to the returned value to forgo monomer validation checks;
+        # this is not unique to __getitem__ but rather a consequence of thinly-wrapping builtin types
 
     def __setitem__(self, resname : str, smarts : Smarts) -> str:
         '''Convenience method to access .monomers directly from instance'''

From 30642dccb6c4edaee4b5ff034124d261d6497b13 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 20:02:57 -0700
Subject: [PATCH 58/78] Added test for degenerate end group autoassignment
 (i.e. when NO terminal monomers are present)

---
 polymerist/tests/polymers/monomers/test_repr.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/polymerist/tests/polymers/monomers/test_repr.py b/polymerist/tests/polymers/monomers/test_repr.py
index 8c6d462..ee58435 100644
--- a/polymerist/tests/polymers/monomers/test_repr.py
+++ b/polymerist/tests/polymers/monomers/test_repr.py
@@ -138,6 +138,11 @@ def test_monogrp_mid_and_term_counts(monogrp : MonomerGroup, expected_counts : t
             {}, 
             {'head' : 'PE1'},
         ), 
+        ( # ...term orientation is unspecified and no end monomers are available for auto-assignment
+            'monogrp_degenerate',
+            {}, 
+            {},
+        ), 
         ( # ...term orientation is unspecified but can be completed for both ends (i.e. at least 2 terminal monomers are available)
             'monogrp_peg_plga',
             {}, 

From bda08d844339e1870cf3b2c96f2512cb7b8cc342 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 21:17:48 -0700
Subject: [PATCH 59/78] Attempted (unsuccessfully) to get __hash__ working for
 MonomerGroup

---
 polymerist/polymers/monomers/repr.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index 505ff70..73f581b 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -29,6 +29,7 @@ class MonomerGroup:
     monomers : dict[str, Union[Smarts, list[Smarts]]] = field(default_factory=dict)
     term_orient : dict[str, str] = field(default_factory=dict) # keys are either "head" or "tail", values are the names of residues in "monomers"
 
+    # MONOMER ADDITION AND VALIDATION
     def __post_init__(self) -> None:
         # Encase bare SMARTS into lists and check that all monomer SMARTS are valid
         monomers_init = self.monomers # store inputted values
@@ -37,12 +38,6 @@ def __post_init__(self) -> None:
             self.add_monomer(resname, smarts)
         # DEV: opted to forgo term_orient check for now, as modifying this violates the read-only data model aimed for here
                 
-    # ATTRIBUTE PROPERTIES AND ALIASES
-    @staticmethod
-    def is_terminal(monomer : Chem.Mol) -> bool:
-        '''Determine whether or not a monomer is terminal'''
-        return get_num_ports(monomer) == 1
-    
     def _add_monomer(self, resname : str, smarts : Smarts) -> None:
         '''Add a new monomer to the templates already stored within, subject to validation checks'''
         if not isinstance(smarts, str): 
@@ -73,6 +68,7 @@ def add_monomer(self, resname : str, smarts : Union[Smarts, Iterable[Smarts]]) -
         else:
             self._add_monomer(resname, smarts) # assume any other inputs are singular values or strings 
     
+    # DUNDER "MAGIC" METHODS
     def __getitem__(self, resname : str) -> str:
         '''Convenience method to access .monomers directly from instance'''
         return self.monomers[resname] # NOTE: deliberately avoid "get()" here to propagate KeyError
@@ -82,6 +78,17 @@ def __getitem__(self, resname : str) -> str:
     def __setitem__(self, resname : str, smarts : Smarts) -> str:
         '''Convenience method to access .monomers directly from instance'''
         self.add_monomer(resname, smarts)
+        
+    def __hash__(self) -> int:
+        '''Hash based on monomer SMARTS and terminal orientation in a canonical order'''
+        # TOSELF: this is far from bulletproof, viz. canonicalization of SMARTS, list value sorting, etc
+        return hash(f'{sorted(self.monomers.items())}{sorted(self.term_orient.items())}')
+    
+    # ATTRIBUTE PROPERTIES AND ALIASES
+    @staticmethod
+    def is_terminal(monomer : Chem.Mol) -> bool:
+        '''Determine whether or not a monomer is terminal'''
+        return get_num_ports(monomer) == 1
     
     @property
     def SMARTS(self) -> dict[str, list[Smarts]]:

From 32f8d534f583f1c0d9d90aec1eba8ac2251b6cef Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Mon, 9 Dec 2024 21:18:43 -0700
Subject: [PATCH 60/78] Wrote unit tests for linear polymer builder

---
 .../tests/polymers/building/test_linear.py    | 132 +++++++++++++++++-
 1 file changed, 126 insertions(+), 6 deletions(-)

diff --git a/polymerist/tests/polymers/building/test_linear.py b/polymerist/tests/polymers/building/test_linear.py
index f9f4721..1c06206 100644
--- a/polymerist/tests/polymers/building/test_linear.py
+++ b/polymerist/tests/polymers/building/test_linear.py
@@ -6,11 +6,131 @@
 import pytest
 from pathlib import Path
 
-from polymerist.genutils.importutils.pkginspect import get_file_path_within_package
-from polymerist.tests import data as testdata
+from .. import PE_FRAGMENTS, MPD_TMC_FRAGMENTS, PEG_PLGA_FRAGMENTS
 
-from polymerist.polymers import building
+from collections import Counter
 
-@pytest.fixture
-def fragments_path() -> Path:
-    return get_file_path_within_package('peg-pla-pga.json', testdata)
\ No newline at end of file
+from polymerist.polymers.building import build_linear_polymer
+from polymerist.polymers.monomers.repr import MonomerGroup
+from polymerist.polymers.exceptions import MorphologyError, PartialBlockSequence, EmptyBlockSequence
+
+
+@pytest.fixture(scope='function')
+def monogrp_polyethylene() ->  MonomerGroup:
+    return MonomerGroup(monomers=PE_FRAGMENTS)
+
+@pytest.fixture(scope='function')
+def monogrp_mpd_tmc() ->  MonomerGroup:
+    return MonomerGroup(monomers=MPD_TMC_FRAGMENTS)
+
+@pytest.fixture(scope='function')
+def monogrp_peg_plga() ->  MonomerGroup:
+    return MonomerGroup(monomers=PEG_PLGA_FRAGMENTS)
+
+
+@pytest.mark.parametrize(
+    'monomers, term_orient, n_monomers, sequence, minimize_sequence, allow_partial_sequences, energy_minimize',
+    [
+        # Polyethylene
+        ('monogrp_polyethylene', {}, 7, 'A', True, True, False), # test end group autogen (should only have 1 term group)
+        ('monogrp_polyethylene', {'head':'PE1', 'tail' : 'PE1'}, 7, 'A', True, True, False), # test explicit head-tail (result here should be different from autogen structure)
+        ('monogrp_polyethylene', {'head':'PE1', 'tail' : 'PE1'}, 7, 'A', True, False, False), # test partial sequences (irrelevant here)
+        ('monogrp_polyethylene', {'head':'PE1', 'tail' : 'PE1'}, 7, 'A', True, False, True), # test energy minimization doesn't crash
+        pytest.param(  # will fail due to an empty sequence being provided - 
+            'monogrp_polyethylene', {}, 7, '', True, True, False, # NOTE: need to have partials enabled, since failure happens ONLY once sequence is passed to mbuild
+            marks=pytest.mark.xfail(
+                raises=EmptyBlockSequence,
+                reason='Sequence provided must be nonempty',
+                strict=True,
+            )
+        ),
+        pytest.param(  # will fail due to too few monomers for given sequence - 
+            'monogrp_polyethylene', {'head':'PE1', 'tail' : 'PE1'}, 7, 'AB', True, True, False, # NOTE: need to have partials enabled, since failure happens ONLY once sequence is passed to mbuild
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+                reason='Fewer unique monomers defined than called for by target sequence',
+                strict=True,
+            )
+        ),
+        # MPD-TMC
+        ('monogrp_mpd_tmc', {'head':'MPD-1', 'tail' : 'TMC-1'}, 8, 'A', True, True, False), # correctly-specified: explicit end groups, only linear middle monomers, and whole number of sequence repeats
+        pytest.param(
+            'monogrp_mpd_tmc', {'head':'MPD-1', 'tail' : 'TMC-1'}, 7, 'AB', True, False, False, # will fail due to partial sequence
+            marks=pytest.mark.xfail(
+                raises=PartialBlockSequence,
+                reason='Partial sequence repeat needed to get odd number block out of AB, but partial blocks are disabled',
+                strict=True,
+            )
+        ),
+        pytest.param(
+            'monogrp_mpd_tmc', {'head':'MPD-1', 'tail' : 'TMC-1'}, 8, 'AB', True, True, False, # will fail due to 3-functional TMC middle monomer as B
+            marks=pytest.mark.xfail(
+                raises=MorphologyError,
+                reason='One of the monomers requested is non-linear (3-functional)',
+                strict=True,
+            )
+        ),
+        # PEG-PLGA
+        ('monogrp_peg_plga', {}, 15, 'ABC', True, True, False), # test autogen
+        ('monogrp_peg_plga', {}, 17, 'ABC', True, False, False), # test autogen with whole sequence
+        ('monogrp_peg_plga', {'head':'PGA-1A', 'tail' : 'PGA-1B'}, 15, 'ABC', True, True, False), # test more complex sequence with non-default explicit end groups
+        pytest.param(
+            'monogrp_peg_plga', {'head':'PGA-1A', 'tail' : 'PGA-1B'}, 15, 'ABC', True, False, False, # will fail due to partial
+            marks=pytest.mark.xfail(
+                raises=PartialBlockSequence,
+                reason='Partial sequence repeat needed to get odd number block out of AB, but partial blocks are disabled',
+                strict=True,
+            )
+        ),
+        ('monogrp_peg_plga', {}, 40, 'ABCB', True, True, True), # test longer energy min
+    ]
+)
+def test_build_linear_polymer(
+        monomers : MonomerGroup,
+        term_orient : dict[str, str],
+        n_monomers : int,
+        sequence : str,
+        minimize_sequence : bool,
+        allow_partial_sequences : bool,
+        energy_minimize : bool,
+        request : pytest.FixtureRequest, # allows for fixture expansion in parameterized arguments
+    ) -> None:
+    '''Test linear polymer builder behavior under varying sets of parameters'''
+    monomers = request.getfixturevalue(monomers) # unpack fixtures into their respective values
+    monomers.term_orient = term_orient # this edit makes it VITAL that fixtures be function-level
+    
+    polymer = build_linear_polymer(
+        monomers=monomers,
+        n_monomers=n_monomers,
+        sequence=sequence,
+        minimize_sequence=minimize_sequence,
+        allow_partial_sequences=allow_partial_sequences,
+        add_Hs=False,
+        energy_minimize=energy_minimize,
+    )
+    
+    # characterize middle monomers
+    n_rep_units = len(polymer.children)
+    residue_sizes : dict[str, int] = {}
+    residue_counts = Counter() # TODO: make use of this for checks!!
+    for middle_monomers in polymer.children:
+        residue_sizes[middle_monomers.name] = middle_monomers.n_particles
+        residue_counts[middle_monomers.name] += 1
+        
+    # characterize end groups
+    end_groups_requested = set(resname for head_or_tail, (resname, mol) in monomers.linear_end_groups().items())
+    end_groups_used = set()
+    for end_group in polymer.end_groups:
+        if end_group is not None:
+            end_groups_used.add(end_group.name)
+            residue_sizes[end_group.name] = end_group.n_particles
+            residue_counts[middle_monomers.name] += 1
+    
+    total_reps_match = (n_rep_units == n_monomers)
+    contribs_match = all(num_monomers == monomers.contributions()[resname][0]
+        for resname, num_monomers in residue_sizes.items()
+    )
+    end_groups_correct = (end_groups_used == end_groups_requested)
+    # counts_match = ...
+    
+    assert all([total_reps_match, contribs_match, end_groups_correct]) #, and counts_match    )
\ No newline at end of file

From 4e3148875a11da8d18666545cb161eec286ccd8b Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 10 Dec 2024 15:05:20 -0700
Subject: [PATCH 61/78] Fixed indent on openff_topology_to_openmm() arguments

---
 polymerist/mdtools/openfftools/omminter/mdobjects.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/polymerist/mdtools/openfftools/omminter/mdobjects.py b/polymerist/mdtools/openfftools/omminter/mdobjects.py
index 2ed2be9..f7dc455 100644
--- a/polymerist/mdtools/openfftools/omminter/mdobjects.py
+++ b/polymerist/mdtools/openfftools/omminter/mdobjects.py
@@ -39,8 +39,13 @@ def forcefield_flexible(forcefield : Union[ForceField, str, Path]) -> ForceField
 
         return ForceField(ff_path)
 
-def openff_topology_to_openmm(offtop : OFFTopology, forcefield : Union[ForceField, str, Path], box_vecs : Optional[Union[VectorQuantity, BoxVectorsQuantity]]=None,
-                              combine_nonbonded_forces : bool=False, add_constrained_forces : bool=False) -> tuple[OMMTopology, System, Quantity]:
+def openff_topology_to_openmm(
+            offtop : OFFTopology,
+            forcefield : Union[ForceField, str, Path],
+            box_vecs : Optional[Union[VectorQuantity, BoxVectorsQuantity]]=None,
+            combine_nonbonded_forces : bool=False,
+            add_constrained_forces : bool=False
+        ) -> tuple[OMMTopology, System, Quantity]:
     '''Converts an OpenFF Topology to an OpenMM Topology, System, and Positions'''
     if box_vecs is not None:
         offtop.box_vectors = box_vectors_flexible(box_vecs)

From 051d128a012aaf8b9359974455f1c120b598b871 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 10 Dec 2024 16:29:56 -0700
Subject: [PATCH 62/78] Removed deprecated local TKREGS import

---
 polymerist/mdtools/openfftools/solvation/solvents/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/polymerist/mdtools/openfftools/solvation/solvents/__init__.py b/polymerist/mdtools/openfftools/solvation/solvents/__init__.py
index f540da0..a11bf94 100644
--- a/polymerist/mdtools/openfftools/solvation/solvents/__init__.py
+++ b/polymerist/mdtools/openfftools/solvation/solvents/__init__.py
@@ -10,7 +10,6 @@
 from openff.units import unit as offunit
 
 from ... import topology
-from ... import TKREGS
 
 
 def generate_water_TIP3P() -> Molecule:

From e0cdfc583e443c8fcea7c0d25d0272037107e4a6 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 10 Dec 2024 16:30:11 -0700
Subject: [PATCH 63/78] Moved unitsys outside of omminter to resolve circular
 import

---
 polymerist/mdtools/openfftools/boxvectors.py             | 2 +-
 polymerist/mdtools/openfftools/omminter/__init__.py      | 6 ------
 polymerist/mdtools/openfftools/omminter/mdobjects.py     | 2 +-
 polymerist/mdtools/openfftools/solvation/physprops.py    | 2 +-
 polymerist/mdtools/openfftools/{omminter => }/unitsys.py | 0
 5 files changed, 3 insertions(+), 9 deletions(-)
 rename polymerist/mdtools/openfftools/{omminter => }/unitsys.py (100%)

diff --git a/polymerist/mdtools/openfftools/boxvectors.py b/polymerist/mdtools/openfftools/boxvectors.py
index 49ea028..7495d6a 100644
--- a/polymerist/mdtools/openfftools/boxvectors.py
+++ b/polymerist/mdtools/openfftools/boxvectors.py
@@ -14,7 +14,7 @@
 from openff.toolkit import Topology
 from openff.interchange.components._packmol import _box_vectors_are_in_reduced_form
 
-from .omminter.unitsys import allow_openmm_units, openff_to_openmm
+from .unitsys import allow_openmm_units, openff_to_openmm
 
 
 # CUSTOM TYPES FOR CLARITY, ESPECIALLY WITH UNITS
diff --git a/polymerist/mdtools/openfftools/omminter/__init__.py b/polymerist/mdtools/openfftools/omminter/__init__.py
index db12755..e1522de 100644
--- a/polymerist/mdtools/openfftools/omminter/__init__.py
+++ b/polymerist/mdtools/openfftools/omminter/__init__.py
@@ -4,9 +4,3 @@
 __email__ = 'timotej.bernat@colorado.edu'
 
 from .mdobjects import forcefield_flexible, openff_topology_to_openmm
-from .unitsys import (
-    openmm_to_openff,
-    openff_to_openmm, 
-    allow_openmm_units,
-    allow_openff_units,
-)
\ No newline at end of file
diff --git a/polymerist/mdtools/openfftools/omminter/mdobjects.py b/polymerist/mdtools/openfftools/omminter/mdobjects.py
index f7dc455..7f0af55 100644
--- a/polymerist/mdtools/openfftools/omminter/mdobjects.py
+++ b/polymerist/mdtools/openfftools/omminter/mdobjects.py
@@ -15,7 +15,7 @@
 from openmm.app import Topology as OMMTopology
 from openmm.unit import Quantity
 
-from .unitsys import openff_to_openmm
+from ..unitsys import openff_to_openmm
 from .. import FFDIR
 from ..boxvectors import box_vectors_flexible, VectorQuantity, BoxVectorsQuantity
 
diff --git a/polymerist/mdtools/openfftools/solvation/physprops.py b/polymerist/mdtools/openfftools/solvation/physprops.py
index 1de70a2..bd9f108 100644
--- a/polymerist/mdtools/openfftools/solvation/physprops.py
+++ b/polymerist/mdtools/openfftools/solvation/physprops.py
@@ -15,7 +15,7 @@
 from openff.units import Quantity as OFFQuantity
 
 from ....unitutils.dimensions import is_volume
-from ..omminter.unitsys import allow_openff_units, openff_to_openmm
+from ..unitsys import allow_openff_units, openff_to_openmm
 
 
 # MASS
diff --git a/polymerist/mdtools/openfftools/omminter/unitsys.py b/polymerist/mdtools/openfftools/unitsys.py
similarity index 100%
rename from polymerist/mdtools/openfftools/omminter/unitsys.py
rename to polymerist/mdtools/openfftools/unitsys.py

From f416a32088d2bf8880d27b3abc1840b7dee2822b Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 10 Dec 2024 17:19:45 -0700
Subject: [PATCH 64/78] Moved sample monomer fragment sets from unit tests to
 polymerist proper

---
 polymerist/polymers/monomers/__init__.py      |  7 +++-
 polymerist/polymers/monomers/fragments.py     | 32 +++++++++++++++++++
 polymerist/tests/polymers/__init__.py         | 32 -------------------
 .../tests/polymers/building/test_linear.py    |  7 ++--
 .../tests/polymers/monomers/test_repr.py      |  2 +-
 5 files changed, 42 insertions(+), 38 deletions(-)
 create mode 100644 polymerist/polymers/monomers/fragments.py

diff --git a/polymerist/polymers/monomers/__init__.py b/polymerist/polymers/monomers/__init__.py
index 176a2ff..7e50795 100644
--- a/polymerist/polymers/monomers/__init__.py
+++ b/polymerist/polymers/monomers/__init__.py
@@ -3,4 +3,9 @@
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
 
-from .repr import MonomerGroup # make monomer representation available at the module level
\ No newline at end of file
+from .repr import MonomerGroup # make monomer representation available at the module level
+from .fragments import (
+    PE_FRAGMENTS,
+    MPD_TMC_FRAGMENTS,
+    PEG_PLGA_FRAGMENTS,
+)
\ No newline at end of file
diff --git a/polymerist/polymers/monomers/fragments.py b/polymerist/polymers/monomers/fragments.py
new file mode 100644
index 0000000..a563d87
--- /dev/null
+++ b/polymerist/polymers/monomers/fragments.py
@@ -0,0 +1,32 @@
+'''Catalogue of monomer fragment templates for some common polymer systems'''
+
+PE_FRAGMENTS : dict[str, list[str]] = {
+    # PE (polyethylene)
+    'PE1': ['[*:1]-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:6])(-[#1D1+0:7])-[#1D1+0:8])(-[#1D1+0:4])-[#1D1+0:5]'],
+    'PE2': ['[*:1]-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:7])-[#1D1+0:8])(-[#1D1+0:5])-[#1D1+0:6]'],
+}
+
+MPD_TMC_FRAGMENTS : dict[str, list[str]] = { # fragments for common polyamide membrane
+    # MPD (m-phenyl diamine)
+    'MPD-1': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:11])-[#6D3+0:5](-[#1D1+0:12])=[#6D3+0:6](-[#1D1+0:13])-[#6D3+0:7](-[#1D1+0:14])=[#6D3+0:8]-1-[#7D3+0:9](-[#1D1+0:15])-[#1D1+0:16])-[#1D1+0:10]'],
+    'MPD-2': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:12])-[#6D3+0:5](-[#1D1+0:13])=[#6D3+0:6](-[#1D1+0:14])-[#6D3+0:7](-[#1D1+0:15])=[#6D3+0:8]-1-[#7D3+0:9](-[*:10])-[#1D1+0:16])-[#1D1+0:11]'],
+    # TMC (trimesoyl chloride)
+    'TMC-1': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[#17D1+0:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'],
+    'TMC-2': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
+    'TMC-3': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
+}
+
+PEG_PLGA_FRAGMENTS : dict[str, list[str]] = { # fragments for all variants of PEG-PLGA-like polymers
+    # PEG (ethylene glycol)
+    'PEG-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7])-[#1D1+0:5]'],
+    'PEG-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#8D2+0:5]-[#1D1+0:10])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'],
+    'PEG-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[*:5])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'],
+    # PLA (lactic acid)
+    'PLA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:4](=[#8D1+0:5])-[*:6])-[#1D1+0:8])-[#1D1+0:7]'],
+    'PLA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[#8D2+0:7]-[#1D1+0:12])-[#1D1+0:8]'],
+    'PLA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[*:7])-[#1D1+0:8]'],
+    # PGA (glycolic acid)
+    'PGA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]'],
+    'PGA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]'],
+    'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'],
+}
\ No newline at end of file
diff --git a/polymerist/tests/polymers/__init__.py b/polymerist/tests/polymers/__init__.py
index d0ddb85..f37a37e 100644
--- a/polymerist/tests/polymers/__init__.py
+++ b/polymerist/tests/polymers/__init__.py
@@ -2,35 +2,3 @@
 
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
-
-
-PE_FRAGMENTS : dict[str, list[str]] = {
-    # PE (polyethylene)
-    'PE1': ['[*:1]-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:6])(-[#1D1+0:7])-[#1D1+0:8])(-[#1D1+0:4])-[#1D1+0:5]'],
-    'PE2': ['[*:1]-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:7])-[#1D1+0:8])(-[#1D1+0:5])-[#1D1+0:6]'],
-}
-
-MPD_TMC_FRAGMENTS : dict[str, list[str]] = { # fragments for common polyamide membrane
-    # MPD (m-phenyl diamine)
-    'MPD-1': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:11])-[#6D3+0:5](-[#1D1+0:12])=[#6D3+0:6](-[#1D1+0:13])-[#6D3+0:7](-[#1D1+0:14])=[#6D3+0:8]-1-[#7D3+0:9](-[#1D1+0:15])-[#1D1+0:16])-[#1D1+0:10]'],
-    'MPD-2': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:12])-[#6D3+0:5](-[#1D1+0:13])=[#6D3+0:6](-[#1D1+0:14])-[#6D3+0:7](-[#1D1+0:15])=[#6D3+0:8]-1-[#7D3+0:9](-[*:10])-[#1D1+0:16])-[#1D1+0:11]'],
-    # TMC (trimesoyl chloride)
-    'TMC-1': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[#17D1+0:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'],
-    'TMC-2': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
-    'TMC-3': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
-}
-
-PEG_PLGA_FRAGMENTS : dict[str, list[str]] = { # fragments for all variants of PEG-PLGA-like polymers
-    # PEG (ethylene glycol)
-    'PEG-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[*:4])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7])-[#1D1+0:5]'],
-    'PEG-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#8D2+0:5]-[#1D1+0:10])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'],
-    'PEG-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[*:5])(-[#1D1+0:8])-[#1D1+0:9])(-[#1D1+0:6])-[#1D1+0:7]'],
-    # PLA (lactic acid)
-    'PLA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D4+0:3](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:4](=[#8D1+0:5])-[*:6])-[#1D1+0:8])-[#1D1+0:7]'],
-    'PLA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[#8D2+0:7]-[#1D1+0:12])-[#1D1+0:8]'],
-    'PLA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D4+0:4](-[#1D1+0:9])(-[#1D1+0:10])-[#1D1+0:11])(-[#6D3+0:5](=[#8D1+0:6])-[*:7])-[#1D1+0:8]'],
-    # PGA (glycolic acid)
-    'PGA-1A': ['[#8D2+0:1](-[#6D4+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])(-[#1D1+0:7])-[#1D1+0:8])-[#1D1+0:6]'],
-    'PGA-1B': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[#8D2+0:6]-[#1D1+0:9])(-[#1D1+0:7])-[#1D1+0:8]'],
-    'PGA-2': ['[*:1]-[#8D2+0:2]-[#6D4+0:3](-[#6D3+0:4](=[#8D1+0:5])-[*:6])(-[#1D1+0:7])-[#1D1+0:8]'],
-}
diff --git a/polymerist/tests/polymers/building/test_linear.py b/polymerist/tests/polymers/building/test_linear.py
index 1c06206..f9be153 100644
--- a/polymerist/tests/polymers/building/test_linear.py
+++ b/polymerist/tests/polymers/building/test_linear.py
@@ -4,14 +4,13 @@
 __email__ = 'timotej.bernat@colorado.edu'
 
 import pytest
-from pathlib import Path
-
-from .. import PE_FRAGMENTS, MPD_TMC_FRAGMENTS, PEG_PLGA_FRAGMENTS
 
 from collections import Counter
 
-from polymerist.polymers.building import build_linear_polymer
 from polymerist.polymers.monomers.repr import MonomerGroup
+from polymerist.polymers.monomers.fragments import PE_FRAGMENTS, MPD_TMC_FRAGMENTS, PEG_PLGA_FRAGMENTS
+
+from polymerist.polymers.building import build_linear_polymer
 from polymerist.polymers.exceptions import MorphologyError, PartialBlockSequence, EmptyBlockSequence
 
 
diff --git a/polymerist/tests/polymers/monomers/test_repr.py b/polymerist/tests/polymers/monomers/test_repr.py
index ee58435..bf93cdf 100644
--- a/polymerist/tests/polymers/monomers/test_repr.py
+++ b/polymerist/tests/polymers/monomers/test_repr.py
@@ -7,8 +7,8 @@
 
 import pytest
 
-from ..import PE_FRAGMENTS, MPD_TMC_FRAGMENTS, PEG_PLGA_FRAGMENTS
 from polymerist.polymers.monomers.repr import MonomerGroup
+from polymerist.polymers.monomers.fragments import PE_FRAGMENTS, MPD_TMC_FRAGMENTS, PEG_PLGA_FRAGMENTS
 
 
 # Example fragments groups

From a77090663adcb6af9f7fdbfa654a41ad2e6e81a0 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 10 Dec 2024 17:24:47 -0700
Subject: [PATCH 65/78] Corrected typo in end group autogen warning

---
 polymerist/polymers/monomers/repr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polymerist/polymers/monomers/repr.py b/polymerist/polymers/monomers/repr.py
index 73f581b..0aa0d8d 100644
--- a/polymerist/polymers/monomers/repr.py
+++ b/polymerist/polymers/monomers/repr.py
@@ -170,7 +170,7 @@ def linear_end_groups(self) -> dict[str, tuple[str, Chem.Mol]]:
             for head_or_tail, (resname, rdmol) in zip(['head', 'tail'], self.iter_rdmols(term_only=True)): # zip will bottom out early if fewer than 2 terminal monomers are present
                 term_orient_auto[head_or_tail] = resname # populate purely for logging
                 end_groups_auto[head_or_tail]  = (resname, rdmol)
-            LOGGER.warning(f'No valid terminal monomer orientations defined; auto-assigned orientations "{term_orient_auto}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
+            LOGGER.warning(f'No valid terminal monomer orientations defined, auto-assigned orientations "{term_orient_auto}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')
                 
             return end_groups_auto
     

From c991c17fcaebcef7a163932bf8f3ce28a4979e9f Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Tue, 10 Dec 2024 17:30:04 -0700
Subject: [PATCH 66/78] Fixed indent on serialize_openmm_pdb() arguments

---
 polymerist/mdtools/openmmtools/serialization.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/polymerist/mdtools/openmmtools/serialization.py b/polymerist/mdtools/openmmtools/serialization.py
index 521e8f0..d29f7cf 100644
--- a/polymerist/mdtools/openmmtools/serialization.py
+++ b/polymerist/mdtools/openmmtools/serialization.py
@@ -119,8 +119,15 @@ def serialize_system(sys_path : Path, system : System) -> None:
         file.write(XmlSerializer.serialize(system))
 
 @allow_string_paths
-def serialize_openmm_pdb(pdb_path : Path, topology : OpenMMTopology, positions : Union[NDArray, list[Vec3]], keep_chain_and_res_ids : bool=True,
-                         uniquify_atom_ids : bool=True, num_atom_id_digits : int=2, resname_map : Optional[dict[str, str]]=None) -> None:
+def serialize_openmm_pdb(
+        pdb_path : Path,
+        topology : OpenMMTopology,
+        positions : Union[NDArray, list[Vec3]],
+        keep_chain_and_res_ids : bool=True,
+        uniquify_atom_ids : bool=True,
+        num_atom_id_digits : int=2,
+        resname_map : Optional[dict[str, str]]=None
+    ) -> None:
     '''Configure and write an Protein DataBank File from an OpenMM Topology and array of positions
     Provides options to configure atom ID numbering, residue numbering, and residue naming'''
     if resname_map is None:

From 040a718ab4e8633d646ed12a80b467d06acc7355 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 13:49:58 -0700
Subject: [PATCH 67/78] Fixed accidental duplication of 3-functional TMC
 monomer fragment

---
 polymerist/polymers/monomers/fragments.py         | 2 +-
 polymerist/tests/polymers/building/test_linear.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/polymerist/polymers/monomers/fragments.py b/polymerist/polymers/monomers/fragments.py
index a563d87..a734e80 100644
--- a/polymerist/polymers/monomers/fragments.py
+++ b/polymerist/polymers/monomers/fragments.py
@@ -12,7 +12,7 @@
     'MPD-2': ['[*:1]-[#7D3+0:2](-[#6D3+0:3]1=[#6D3+0:4](-[#1D1+0:12])-[#6D3+0:5](-[#1D1+0:13])=[#6D3+0:6](-[#1D1+0:14])-[#6D3+0:7](-[#1D1+0:15])=[#6D3+0:8]-1-[#7D3+0:9](-[*:10])-[#1D1+0:16])-[#1D1+0:11]'],
     # TMC (trimesoyl chloride)
     'TMC-1': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[#17D1+0:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'],
-    'TMC-2': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
+    'TMC-2': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[#17D1+0:15]'],
     'TMC-3': ['[#6D3+0:1]1(-[#1D1+0:16])=[#6D3+0:2](-[#6D3+0:3](=[#8D1+0:4])-[*:5])-[#6D3+0:6](-[#1D1+0:17])=[#6D3+0:7](-[#6D3+0:8](=[#8D1+0:9])-[*:10])-[#6D3+0:11](-[#1D1+0:18])=[#6D3+0:12]-1-[#6D3+0:13](=[#8D1+0:14])-[*:15]'],
 }
 
diff --git a/polymerist/tests/polymers/building/test_linear.py b/polymerist/tests/polymers/building/test_linear.py
index f9be153..2c086a9 100644
--- a/polymerist/tests/polymers/building/test_linear.py
+++ b/polymerist/tests/polymers/building/test_linear.py
@@ -62,7 +62,7 @@ def monogrp_peg_plga() ->  MonomerGroup:
             )
         ),
         pytest.param(
-            'monogrp_mpd_tmc', {'head':'MPD-1', 'tail' : 'TMC-1'}, 8, 'AB', True, True, False, # will fail due to 3-functional TMC middle monomer as B
+            'monogrp_mpd_tmc', {'head':'MPD-1', 'tail' : 'TMC-1'}, 12, 'ABC', True, True, False, # will fail due to 3-functional TMC middle monomer as C
             marks=pytest.mark.xfail(
                 raises=MorphologyError,
                 reason='One of the monomers requested is non-linear (3-functional)',
@@ -74,7 +74,7 @@ def monogrp_peg_plga() ->  MonomerGroup:
         ('monogrp_peg_plga', {}, 17, 'ABC', True, False, False), # test autogen with whole sequence
         ('monogrp_peg_plga', {'head':'PGA-1A', 'tail' : 'PGA-1B'}, 15, 'ABC', True, True, False), # test more complex sequence with non-default explicit end groups
         pytest.param(
-            'monogrp_peg_plga', {'head':'PGA-1A', 'tail' : 'PGA-1B'}, 15, 'ABC', True, False, False, # will fail due to partial
+            'monogrp_peg_plga', {'head':'PGA-1A', 'tail' : 'PGA-1B'}, 15, 'ABC', True, False, False, # will fail due to partial sequence
             marks=pytest.mark.xfail(
                 raises=PartialBlockSequence,
                 reason='Partial sequence repeat needed to get odd number block out of AB, but partial blocks are disabled',

From bbd1b8583f2e5dc1d52819e38dfcf154f51d65c0 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 14:53:54 -0700
Subject: [PATCH 68/78] Added new subpackage for molecule file I/O

---
 polymerist/molfiles/__init__.py |  4 ++
 polymerist/molfiles/pdb.py      | 72 +++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 polymerist/molfiles/__init__.py
 create mode 100644 polymerist/molfiles/pdb.py

diff --git a/polymerist/molfiles/__init__.py b/polymerist/molfiles/__init__.py
new file mode 100644
index 0000000..314438a
--- /dev/null
+++ b/polymerist/molfiles/__init__.py
@@ -0,0 +1,4 @@
+'''Utilities for reading from and writing to various molecular file formats'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
\ No newline at end of file
diff --git a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py
new file mode 100644
index 0000000..d998a0f
--- /dev/null
+++ b/polymerist/molfiles/pdb.py
@@ -0,0 +1,72 @@
+'''PDB file formatting tools'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+from dataclasses import dataclass, field
+from collections import Counter
+
+
+@dataclass
+class SerialAtomLabeller:
+    '''
+    For assigning unique numbered atom names based on their
+    order of appearance within a molecule and elemental class
+    
+    Useful, for example, in generating unique atom names for a PDB file
+    
+    Parameters
+    ----------
+    atom_label_size : int , default 4      
+        Exact length alloted for any generated atom label
+        Labels shorter than this are right-padded with spaces,
+        while labels longer than this are truncated
+        
+        Default of 4 was chosen to be compatible with the PDB specification ("Atom name: columns 13-16, left-justified")
+        https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
+    include_elem_idx : bool, default True  
+        Whether to attach a numerical element-index postfix to atom labels
+        
+        E.g. with atom_label_size=4, the fifth carbon in a topology  
+        will be labelled as "C004" with include_elem_idx=True, 
+        while labelled as "C   " with include_elem_idx=False, 
+    default_elem_idx : int, default 0
+        Starting index for each element category
+        By default, is 0-indexed; MUST BE POSITIVE
+    '''
+    atom_label_size  : int = 4
+    include_elem_idx : bool = True
+    default_elem_idx : int = 0
+    
+    element_counter : Counter = field(init=False, default_factory=Counter)
+    
+    def __post_init__(self) -> None:
+        '''Check ranges on input values'''
+        if self.atom_label_size < 0:
+            raise ValueError(f'Must provide a non-negative number of index digits to include (provided {self.num_idx_digits})')
+
+        if self.default_elem_idx < 0:
+            raise ValueError(f'Must provide a non-negative starting index for element indices (provided {self.default_elem_idx})')
+    
+    def get_atom_label(self, elem_symbol : str) -> str:
+        '''
+        Obtain a numbered atom label for an atom based on its element, 
+        updating the underlying element context in the process
+        '''
+        if elem_symbol not in self.element_counter: # initialize first occurence to starting value
+            self.element_counter[elem_symbol] = self.default_elem_idx
+            
+        atom_idx_label : str = ''
+        if self.include_elem_idx:
+            atom_idx = self.element_counter[elem_symbol]
+            num_idx_digits = max(self.atom_label_size - len(elem_symbol), 0) # number of symbols left over for an atom index
+            atom_idx_label = f'{atom_idx:0{num_idx_digits}d}'
+        
+        atom_name = f'{elem_symbol}{atom_idx_label}'
+        atom_name = atom_name.ljust(self.atom_label_size, ' ')[:self.atom_label_size] # pad with spaces if too short, or truncate if too long
+        assert(len(atom_name) <= self.atom_label_size) # perfunctory check to make sure things are working as expected
+        
+        self.element_counter[elem_symbol] += 1 # update tally with addition of new occurence of a particular element
+        
+        return atom_name
+    
\ No newline at end of file

From 34d1a7bf801f1aee1e718197677804b2587cdab4 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 15:53:04 -0700
Subject: [PATCH 69/78] Froze SerialAtomLabeller dataclass to avoid
 unintentional label format mutation

---
 polymerist/molfiles/pdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py
index d998a0f..751d611 100644
--- a/polymerist/molfiles/pdb.py
+++ b/polymerist/molfiles/pdb.py
@@ -7,7 +7,7 @@
 from collections import Counter
 
 
-@dataclass
+@dataclass(frozen=True)
 class SerialAtomLabeller:
     '''
     For assigning unique numbered atom names based on their

From d9afdc49f5a5efcc77ac412dca3be05bfc295192 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 15:53:51 -0700
Subject: [PATCH 70/78] Switched PDB atom labeller to dependency-injection
 based model

---
 .../mdtools/openmmtools/serialization.py      | 16 ++---
 polymerist/polymers/building/mbconvert.py     | 71 ++++++++-----------
 2 files changed, 36 insertions(+), 51 deletions(-)

diff --git a/polymerist/mdtools/openmmtools/serialization.py b/polymerist/mdtools/openmmtools/serialization.py
index d29f7cf..97348a6 100644
--- a/polymerist/mdtools/openmmtools/serialization.py
+++ b/polymerist/mdtools/openmmtools/serialization.py
@@ -23,6 +23,7 @@
 from ...genutils.fileutils.pathutils import assemble_path
 from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable
 from ...genutils.fileutils.jsonio.serialize import PathSerializer
+from ...molfiles.pdb import SerialAtomLabeller
 
 
 # DEFINING AND STORING SIMULATION PATHS
@@ -124,9 +125,8 @@ def serialize_openmm_pdb(
         topology : OpenMMTopology,
         positions : Union[NDArray, list[Vec3]],
         keep_chain_and_res_ids : bool=True,
-        uniquify_atom_ids : bool=True,
-        num_atom_id_digits : int=2,
-        resname_map : Optional[dict[str, str]]=None
+        atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(),
+        resname_map : Optional[dict[str, str]]=None,
     ) -> None:
     '''Configure and write an Protein DataBank File from an OpenMM Topology and array of positions
     Provides options to configure atom ID numbering, residue numbering, and residue naming'''
@@ -145,13 +145,9 @@ def serialize_openmm_pdb(
             residue.name = repl_res_name
 
     # individual atom config
-    element_counter = Counter() # for keeping track of the running index of each distinct element - could be used to produce a Hill formula
-    for atom in topology.atoms():
-        symbol = atom.element.symbol
-        atom_id = element_counter[symbol]
-        if uniquify_atom_ids:
-            atom.name = f'{symbol}{atom_id:0{num_atom_id_digits}d}' # extend atom name with ordered integer with specified number of digits (including leading zeros)
-        element_counter[symbol] += 1
+    if atom_labeller: # implicitly, preserves extant atom names if a labeller is not given
+        for atom in topology.atoms():
+            atom.name = atom_labeller.get_atom_label(atom.element.symbol)
 
     # file write
     with pdb_path.open('w') as file:
diff --git a/polymerist/polymers/building/mbconvert.py b/polymerist/polymers/building/mbconvert.py
index ebef303..8d0be5d 100644
--- a/polymerist/polymers/building/mbconvert.py
+++ b/polymerist/polymers/building/mbconvert.py
@@ -18,11 +18,7 @@
     )
 
 from typing import Optional
-
 from pathlib import Path
-from collections import Counter
-
-from rdkit import Chem
 
 import warnings
 with warnings.catch_warnings(record=True): # suppress numerous and irritating mbuild deprecation warnings
@@ -30,8 +26,11 @@
     from mbuild import Compound
     from mbuild.conversion import from_rdkit
     
-from ..monomers.specification import SANITIZE_AS_KEKULE
+from rdkit import Chem
+
 from ...genutils.decorators.functional import allow_string_paths, allow_pathlib_paths
+from ..monomers.specification import SANITIZE_AS_KEKULE
+from ...molfiles.pdb import SerialAtomLabeller
 from ...rdutils.bonding.portlib import get_linker_ids
 from ...rdutils.bonding.substitution import saturate_ports, hydrogenate_rdmol_ports
 from ...mdtools.openmmtools.serialization import serialize_openmm_pdb
@@ -57,22 +56,24 @@ def mbmol_from_mono_rdmol(rdmol : Chem.Mol, resname : Optional[str]=None) -> tup
     return mb_compound, linker_ids
    
 # Conversion from Compound to other formats
+_DEFAULT_RESNAME_MAP : dict[str, str] = { # module-wide config for default PDB residue name replacements for polymers
+    'RES' : 'Pol',
+}
+
 def mbmol_to_rdmol( # TODO: deduplify PDB atom name and residue numbering code against serialize_openmm_pdb()
         mbmol : Compound,
-        uniquify_atom_ids : bool=False,
-        num_atom_id_digits : int=2,
+        atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(),
         resname_map : Optional[dict[str, str]]=None
     ) -> Chem.Mol:
     '''Convert an mBuild Compound into an RDKit Mol, with correct atom coordinates and PDB residue info'''
     if resname_map is None:
-        resname_map = {}
+        resname_map = _DEFAULT_RESNAME_MAP
     
     rdmol = mbmol.to_rdkit()
     conformer = Chem.Conformer()
     conformer.Set3D(True)
 
     atom_id : int = 0
-    element_counter = Counter()
     for resnum, mb_monomer in enumerate(mbmol.children, start=1):
         resname = resname_map.get(mb_monomer.name, mb_monomer.name[:3]) # if no remapping is found, just take first 3 chars
         # NOTE: the order of monomers and atoms within those monomers were added in the same order as iterated over here...
@@ -82,64 +83,52 @@ def mbmol_to_rdmol( # TODO: deduplify PDB atom name and residue numbering code a
 
             # set PDB residue info if monomer hierarchy is present
             if mbatom != mb_monomer: # for Compounds with a flat hierarchy, the children and particles of children will coincide
-                symbol = mbatom.element.symbol
-                atom_ser_id = element_counter[symbol]
-                atom_ser_str = f'{atom_ser_id:0{num_atom_id_digits}d}' if uniquify_atom_ids else '  ' # double space keeps column justification correct when non-unique
-                atom_name = f' {symbol}{atom_ser_str}' # need a leading space to get column alignment in PDB compliant with spec
-                
                 pdb_info = Chem.AtomPDBResidueInfo(
-                    atomName=atom_name, 
+                    atomName=4*' ' if not atom_labeller else atom_labeller.get_atom_label(mbatom.element.symbol), 
                     residueName=resname,
                     residueNumber=resnum,
                     chainId='1',
                     isHeteroAtom=True,
                 )
-                element_counter[symbol] += 1 # only increment AFTER prior value has been assigned to the current atom
                 rdmol.GetAtomWithIdx(atom_id).SetPDBResidueInfo(pdb_info)
-            
             atom_id += 1 # TODO: this is an awful waay of keeping track of atom indices, see if there's a more secure way to do this
-    conf_id = rdmol.AddConformer(conformer)
+    conf_id = rdmol.AddConformer(conformer) # NOTE: recording this to self-document return values (this is intentionally not used)
     
     return rdmol
 
 # Serialization of Compounds to files
+@allow_pathlib_paths
+def mbmol_to_rdkit_pdb(
+        pdb_path : str,
+        mbmol : Compound, 
+        atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(),
+        resname_map : Optional[dict[str, str]]=None,
+    ) -> None:
+    '''Save an MBuild Compound into an RDKit-formatted PDB file'''
+    Chem.MolToPDBFile(
+        mbmol_to_rdmol(mbmol, atom_labeller=atom_labeller, resname_map=resname_map),
+        pdb_path,
+    )
+    
 @allow_string_paths
 def mbmol_to_openmm_pdb(
         pdb_path : Path,
         mbmol : Compound, 
-        num_atom_digits : int=2,
+        atom_labeller : Optional[SerialAtomLabeller]=SerialAtomLabeller(),
         resname_map : Optional[dict[str, str]]=None,
     ) -> None:
     '''Save an MBuild Compound into an OpenMM-formatted PDB file'''
     if resname_map is None: # avoid mutable default
-        resname_map = {'RES' : 'Pol'} 
+        resname_map = _DEFAULT_RESNAME_MAP 
 
     traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more infor-rich format)
     omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory
+    # TODO: add monomer name transfer to PDB residue names
 
     serialize_openmm_pdb(
         pdb_path,
         topology=omm_top,
         positions=omm_pos,
-        uniquify_atom_ids=True,
-        num_atom_id_digits=num_atom_digits,
-        resname_map=resname_map
-    )
-
-@allow_pathlib_paths
-def mbmol_to_rdkit_pdb(
-        pdb_path : str,
-        mbmol : Compound, 
-        num_atom_digits : int=2,
-        resname_map : Optional[dict[str, str]]=None,
-    ) -> None:
-    '''Save an MBuild Compound into an RDKit-formatted PDB file'''
-    Chem.MolToPDBFile(
-        mbmol_to_rdmol(
-            mbmol,
-            uniquify_atom_ids=True,
-            num_atom_id_digits=num_atom_digits,
-            resname_map=resname_map
-        ),
-        pdb_path,
+        atom_labeller=atom_labeller,
+        resname_map=resname_map,
     )
\ No newline at end of file

From 668c3f96fef9dd2edcfe5b9cbaf5156d75867449 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 16:04:17 -0700
Subject: [PATCH 71/78] Renamed "chain" to "polymer" where it occurs to avoid
 confusion with the related-but-distinct OpenMM notion of a Chain

---
 polymerist/polymers/building/linear.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/polymerist/polymers/building/linear.py b/polymerist/polymers/building/linear.py
index b1a5fd4..dab50f4 100644
--- a/polymerist/polymers/building/linear.py
+++ b/polymerist/polymers/building/linear.py
@@ -50,21 +50,21 @@ def build_linear_polymer(
     sequence_unique = unique_string(sequence_compliant, preserve_order=True) # only register a new monomer for each appearance of a new, unique symbol in the sequence
     
     # 2) REGISTERING MONOMERS TO BE USED FOR CHAIN ASSEMBLY
-    chain = MBPolymer() 
+    polymer = MBPolymer() 
     monomers_selected = MonomerGroup() # used to track and estimate sized of the monomers being used for building
     
     ## 2A) ADD MIDDLE MONOMERS TO CHAIN
     for symbol, (resname, middle_monomer) in zip(sequence_unique, monomers.iter_rdmols(term_only=False)): # zip with sequence limits number of middle monomers to length of block sequence
         LOGGER.info(f'Registering middle monomer {resname} (block identifier "{symbol}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(middle_monomer, resname=resname)
-        chain.add_monomer(compound=mb_monomer, indices=linker_ids)
+        polymer.add_monomer(compound=mb_monomer, indices=linker_ids)
         monomers_selected.monomers[resname] = monomers.monomers[resname]
 
     ## 2B) ADD TERMINAL MONOMERS TO CHAIN
     for head_or_tail, (resname, term_monomer) in end_groups.items():
         LOGGER.info(f'Registering terminal monomer {resname} (orientation "{head_or_tail}")')
         mb_monomer, linker_ids = mbmol_from_mono_rdmol(term_monomer, resname=resname)
-        chain.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
+        polymer.add_end_groups(compound=mb_monomer, index=linker_ids.pop(), label=head_or_tail, duplicate=False) # use single linker ID and provided head-tail orientation
         monomers_selected.monomers[resname] = monomers.monomers[resname]
 
     # 3) ASSEMBLE AND RETURN CHAIN
@@ -74,15 +74,15 @@ def build_linear_polymer(
     n_atoms_est = estimate_n_atoms_linear(monomers_selected, n_monomers) # TODO: create new MonomerGroup with ONLY the registered monomers to guarantee accuracy
     LOGGER.info(f'Assembling linear {n_monomers}-mer chain (estimated {n_atoms_est} atoms)')
     
-    chain.build(n_seq_repeats, sequence=sequence_compliant, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
-    for atom in chain.particles():
+    polymer.build(n_seq_repeats, sequence=sequence_compliant, add_hydrogens=add_Hs) # "-2" is to account for term groups (in mbuild, "n" is the number of times to replicate just the middle monomers)
+    for atom in polymer.particles():
         atom.charge = 0.0 # initialize all atoms as being uncharged (gets rid of pesky blocks of warnings)
-    LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {chain.n_particles} atoms)')
+    LOGGER.info(f'Successfully assembled linear {n_monomers}-mer chain (exactly {polymer.n_particles} atoms)')
     
     # 4) OPTIONALLY, PERFORM FINAL UFF ENERGY MINIMIZATION
     if energy_minimize:
         LOGGER.info('Energy-minimizing chain to find more stable conformer')
-        chain.energy_minimize()
+        polymer.energy_minimize()
         LOGGER.info('Energy minimization completed')
 
-    return chain
\ No newline at end of file
+    return polymer
\ No newline at end of file

From a5cc442bccfa42597a9da65f6b29f53a1e2bd2dd Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 16:06:52 -0700
Subject: [PATCH 72/78] Added placeholder unit tests for newly-created
 `molfiles` subpackage

---
 polymerist/tests/molfiles/__init__.py | 4 ++++
 polymerist/tests/molfiles/test_pdb.py | 4 ++++
 2 files changed, 8 insertions(+)
 create mode 100644 polymerist/tests/molfiles/__init__.py
 create mode 100644 polymerist/tests/molfiles/test_pdb.py

diff --git a/polymerist/tests/molfiles/__init__.py b/polymerist/tests/molfiles/__init__.py
new file mode 100644
index 0000000..d92450c
--- /dev/null
+++ b/polymerist/tests/molfiles/__init__.py
@@ -0,0 +1,4 @@
+'''Unit tests for `molfiles` package'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
diff --git a/polymerist/tests/molfiles/test_pdb.py b/polymerist/tests/molfiles/test_pdb.py
new file mode 100644
index 0000000..f7795b9
--- /dev/null
+++ b/polymerist/tests/molfiles/test_pdb.py
@@ -0,0 +1,4 @@
+'''Unit tests for PDB file I/O utils'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'

From 4d128e3b50c3fe5c0e60a710462d651cbc24d03f Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 16:51:22 -0700
Subject: [PATCH 73/78] Added residue info injection into mbmol_to_openmm_pdb
 (PDB outputs are now totally consistent with RDKit PDB output)

---
 polymerist/polymers/building/mbconvert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/polymerist/polymers/building/mbconvert.py b/polymerist/polymers/building/mbconvert.py
index 8d0be5d..61656d9 100644
--- a/polymerist/polymers/building/mbconvert.py
+++ b/polymerist/polymers/building/mbconvert.py
@@ -121,9 +121,10 @@ def mbmol_to_openmm_pdb(
     if resname_map is None: # avoid mutable default
         resname_map = _DEFAULT_RESNAME_MAP 
 
-    traj = mbmol.to_trajectory() # first convert to MDTraj representation (much more infor-rich format)
+    # NOTE: converting through MDTraj first before going to OpenMM preserves much
+    # of the necessary chemical info that is discarded when converting through other formats
+    traj = mbmol.to_trajectory(residues=[residue.name for residue in mbmol.children]) # extract names of repeat units
     omm_top, omm_pos = traj.top.to_openmm(), traj.openmm_positions(0) # extract OpenMM representations of trajectory
-    # TODO: add monomer name transfer to PDB residue names
 
     serialize_openmm_pdb(
         pdb_path,

From f1f8039a5a81e5ab16ac3bd64e5fceecadfdf251 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 17:16:31 -0700
Subject: [PATCH 74/78] Renamed "atom_label_size" to "atom_label_length" for
 clarity

---
 polymerist/molfiles/pdb.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py
index 751d611..dc9f592 100644
--- a/polymerist/molfiles/pdb.py
+++ b/polymerist/molfiles/pdb.py
@@ -17,7 +17,7 @@ class SerialAtomLabeller:
     
     Parameters
     ----------
-    atom_label_size : int , default 4      
+    atom_label_length : int , default 4      
         Exact length alloted for any generated atom label
         Labels shorter than this are right-padded with spaces,
         while labels longer than this are truncated
@@ -27,22 +27,22 @@ class SerialAtomLabeller:
     include_elem_idx : bool, default True  
         Whether to attach a numerical element-index postfix to atom labels
         
-        E.g. with atom_label_size=4, the fifth carbon in a topology  
+        E.g. with atom_label_length=4, the fifth carbon in a topology  
         will be labelled as "C004" with include_elem_idx=True, 
         while labelled as "C   " with include_elem_idx=False, 
     default_elem_idx : int, default 0
         Starting index for each element category
         By default, is 0-indexed; MUST BE POSITIVE
     '''
-    atom_label_size  : int = 4
-    include_elem_idx : bool = True
-    default_elem_idx : int = 0
+    atom_label_length : int = 4
+    include_elem_idx  : bool = True
+    default_elem_idx  : int = 0
     
     element_counter : Counter = field(init=False, default_factory=Counter)
     
     def __post_init__(self) -> None:
         '''Check ranges on input values'''
-        if self.atom_label_size < 0:
+        if self.atom_label_length < 0:
             raise ValueError(f'Must provide a non-negative number of index digits to include (provided {self.num_idx_digits})')
 
         if self.default_elem_idx < 0:
@@ -59,12 +59,12 @@ def get_atom_label(self, elem_symbol : str) -> str:
         atom_idx_label : str = ''
         if self.include_elem_idx:
             atom_idx = self.element_counter[elem_symbol]
-            num_idx_digits = max(self.atom_label_size - len(elem_symbol), 0) # number of symbols left over for an atom index
+            num_idx_digits = max(self.atom_label_length - len(elem_symbol), 0) # number of symbols left over for an atom index
             atom_idx_label = f'{atom_idx:0{num_idx_digits}d}'
         
         atom_name = f'{elem_symbol}{atom_idx_label}'
-        atom_name = atom_name.ljust(self.atom_label_size, ' ')[:self.atom_label_size] # pad with spaces if too short, or truncate if too long
-        assert(len(atom_name) <= self.atom_label_size) # perfunctory check to make sure things are working as expected
+        atom_name = atom_name.ljust(self.atom_label_length, ' ')[:self.atom_label_length] # pad with spaces if too short, or truncate if too long
+        assert(len(atom_name) <= self.atom_label_length) # perfunctory check to make sure things are working as expected
         
         self.element_counter[elem_symbol] += 1 # update tally with addition of new occurence of a particular element
         

From 370dc3aeb5b913c3fddfc4cc7c6913198c980415 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 17:33:33 -0700
Subject: [PATCH 75/78] Renamed once more to atom_label_width

---
 polymerist/molfiles/pdb.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py
index dc9f592..b0fe63b 100644
--- a/polymerist/molfiles/pdb.py
+++ b/polymerist/molfiles/pdb.py
@@ -17,7 +17,7 @@ class SerialAtomLabeller:
     
     Parameters
     ----------
-    atom_label_length : int , default 4      
+    atom_label_width : int , default 4      
         Exact length alloted for any generated atom label
         Labels shorter than this are right-padded with spaces,
         while labels longer than this are truncated
@@ -27,22 +27,22 @@ class SerialAtomLabeller:
     include_elem_idx : bool, default True  
         Whether to attach a numerical element-index postfix to atom labels
         
-        E.g. with atom_label_length=4, the fifth carbon in a topology  
+        E.g. with atom_label_width=4, the fifth carbon in a topology  
         will be labelled as "C004" with include_elem_idx=True, 
         while labelled as "C   " with include_elem_idx=False, 
     default_elem_idx : int, default 0
         Starting index for each element category
         By default, is 0-indexed; MUST BE POSITIVE
     '''
-    atom_label_length : int = 4
-    include_elem_idx  : bool = True
-    default_elem_idx  : int = 0
+    atom_label_width : int = 4
+    include_elem_idx : bool = True
+    default_elem_idx : int = 0
     
     element_counter : Counter = field(init=False, default_factory=Counter)
     
     def __post_init__(self) -> None:
         '''Check ranges on input values'''
-        if self.atom_label_length < 0:
+        if self.atom_label_width < 0:
             raise ValueError(f'Must provide a non-negative number of index digits to include (provided {self.num_idx_digits})')
 
         if self.default_elem_idx < 0:
@@ -59,12 +59,12 @@ def get_atom_label(self, elem_symbol : str) -> str:
         atom_idx_label : str = ''
         if self.include_elem_idx:
             atom_idx = self.element_counter[elem_symbol]
-            num_idx_digits = max(self.atom_label_length - len(elem_symbol), 0) # number of symbols left over for an atom index
+            num_idx_digits = max(self.atom_label_width - len(elem_symbol), 0) # number of symbols left over for an atom index
             atom_idx_label = f'{atom_idx:0{num_idx_digits}d}'
         
         atom_name = f'{elem_symbol}{atom_idx_label}'
-        atom_name = atom_name.ljust(self.atom_label_length, ' ')[:self.atom_label_length] # pad with spaces if too short, or truncate if too long
-        assert(len(atom_name) <= self.atom_label_length) # perfunctory check to make sure things are working as expected
+        atom_name = atom_name.ljust(self.atom_label_width, ' ')[:self.atom_label_width] # pad with spaces if too short, or truncate if too long
+        assert(len(atom_name) <= self.atom_label_width) # perfunctory check to make sure things are working as expected
         
         self.element_counter[elem_symbol] += 1 # update tally with addition of new occurence of a particular element
         

From c61f16a99e9bb7afeb361c5a641eb163f93dd3c8 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 17:41:17 -0700
Subject: [PATCH 76/78] Fixed non-attribute value in atom_label_width Exception
 message

---
 polymerist/molfiles/pdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py
index b0fe63b..336e382 100644
--- a/polymerist/molfiles/pdb.py
+++ b/polymerist/molfiles/pdb.py
@@ -43,7 +43,7 @@ class SerialAtomLabeller:
     def __post_init__(self) -> None:
         '''Check ranges on input values'''
         if self.atom_label_width < 0:
-            raise ValueError(f'Must provide a non-negative number of index digits to include (provided {self.num_idx_digits})')
+            raise ValueError(f'Must provide a non-negative number of index digits to include (provided {self.atom_label_width})')
 
         if self.default_elem_idx < 0:
             raise ValueError(f'Must provide a non-negative starting index for element indices (provided {self.default_elem_idx})')

From e40600ec158d611cb5d9a06b2d1d4523ec1ac2e0 Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 17:45:17 -0700
Subject: [PATCH 77/78] Added string type check for atom element symbols

---
 polymerist/molfiles/pdb.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py
index 336e382..1852423 100644
--- a/polymerist/molfiles/pdb.py
+++ b/polymerist/molfiles/pdb.py
@@ -53,6 +53,9 @@ def get_atom_label(self, elem_symbol : str) -> str:
         Obtain a numbered atom label for an atom based on its element, 
         updating the underlying element context in the process
         '''
+        if not isinstance(elem_symbol, str):
+            raise TypeError(f'Must pass symbol of atom\'s element as str (not type {type(elem_symbol).__name__})')
+        
         if elem_symbol not in self.element_counter: # initialize first occurence to starting value
             self.element_counter[elem_symbol] = self.default_elem_idx
             

From 5c282d73932cd98090a24894e8f957eae1e7681c Mon Sep 17 00:00:00 2001
From: Timotej Bernat <tibe3324@colorado.edu>
Date: Wed, 11 Dec 2024 17:45:33 -0700
Subject: [PATCH 78/78] Wrote unit tests for molfiles.pdb.SerialAtomLabeller

---
 polymerist/tests/molfiles/test_pdb.py | 58 +++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/polymerist/tests/molfiles/test_pdb.py b/polymerist/tests/molfiles/test_pdb.py
index f7795b9..9be9f3a 100644
--- a/polymerist/tests/molfiles/test_pdb.py
+++ b/polymerist/tests/molfiles/test_pdb.py
@@ -2,3 +2,61 @@
 
 __author__ = 'Timotej Bernat'
 __email__ = 'timotej.bernat@colorado.edu'
+
+import pytest
+from polymerist.molfiles.pdb import SerialAtomLabeller
+
+
+ELEMS : tuple[str] = ('C', 'H', 'H', 'H', 'N', 'H', 'C', 'O', 'Cl') # atoms for methylcarbamoyl chloride (MCC)
+
+@pytest.mark.parametrize(
+    'mol_atom_elems, atom_label_width, include_elem_idx, default_elem_idx, expected_labels',
+    [
+        (ELEMS, 4, True, 0, ['C000', 'H000', 'H001', 'H002', 'N000', 'H003', 'C001', 'O000', 'Cl00']), # test with default PDB-compatible settings
+        (ELEMS, 4, True, 2, ['C002', 'H002', 'H003', 'H004', 'N002', 'H005', 'C003', 'O002', 'Cl02']), # test element index offset
+        (ELEMS, 3, True, 2, ['C02', 'H02', 'H03', 'H04', 'N02', 'H05', 'C03', 'O02', 'Cl2']), # test shorter atom label width
+        (ELEMS, 1, True, 0, ['C', 'H', 'H', 'H', 'N', 'H', 'C', 'O', 'C']), # test truncation works below threshold where indices can be written
+        (ELEMS, 4, False, 0, ['C   ', 'H   ', 'H   ', 'H   ', 'N   ', 'H   ', 'C   ', 'O   ', 'Cl  ']), # test without element indices
+        (ELEMS, 4, False, 7, ['C   ', 'H   ', 'H   ', 'H   ', 'N   ', 'H   ', 'C   ', 'O   ', 'Cl  ']), # test that the default index has no impact when indices aren't included
+        (ELEMS, 0, False, 0, ['', '', '', '', '', '', '', '', '']), # test null-width labels
+        # Invalid input handling checks
+        pytest.param(
+            ELEMS, -1, True, 0, [], # test that negative label width is rejected as intended
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+                reason='Negative atom label widths not allowed',
+                strict=True,
+            )
+        ),
+        pytest.param(
+            ELEMS, 4, True, -5, [], # test that negative default indices are rejected as intended
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+                reason='Negative element indices not allowed',
+                strict=True,
+            )
+        ),
+        pytest.param(
+            tuple(len(elem) for elem in ELEMS), 4, True, 0, [], # test that non-string element symbols are rejected as intended
+            marks=pytest.mark.xfail(
+                raises=TypeError,
+                reason='Must pass atom elements as strings',
+                strict=True,
+            )
+        ),
+    ]
+)
+def test_atom_labeller(
+        mol_atom_elems : tuple[str],
+        atom_label_width : int,
+        include_elem_idx : bool,
+        default_elem_idx : int,
+        expected_labels : list[str],
+    ) -> None:
+    '''Test that atom labelling behaves as expected with various label formatting configurations'''
+    labeller = SerialAtomLabeller(
+        atom_label_width=atom_label_width,
+        include_elem_idx=include_elem_idx,
+        default_elem_idx=default_elem_idx,
+    )
+    assert [labeller.get_atom_label(elem) for elem in mol_atom_elems] == expected_labels
\ No newline at end of file