rdkit · chem-bio · Dec 20, 2023 · Oct 12, 2021 · Oct 12, 2021 · Oct 13, 2021
diff --git a/mmpdblib/commandline.py b/mmpdblib/commandline.py
@@ -158,18 +158,18 @@ def add_in_memory(parser):
 
 2) Read from a gzip-compressed tab-delimited SMILES file. Use 8
 threads to fragment the structures. Save the results to
-dataset.fragments.gz .
+dataset.fragdb
 
   % mmpa fragment --delimiter tab dataset.smi.gz --num-jobs 8 \\
-      -o dataset.fragments.gz
+      -o dataset.fragdb
 
 3) Fragment the SMILES in 'dataset.smi.gz'. Reuse fragment information
-from the cache file 'old_dataset.fragments.gz' if possible, instead of
+from the cache file 'old_dataset.fragdb' if possible, instead of
 computing the fragments from scratch each time. Save the results to
-'new_dataset.fragments.gz'.
+'new_dataset.fragdb'.
 
-  % mmpa fragment --cache old_dataset.fragments.gz dataset.smi.gz \\
-      -o new_dataset.fragments.gz
+  % mmpa fragment --cache old_dataset.fragdb dataset.smi.gz \\
+      -o new_dataset.fragdb
 
 
 """ + smarts_aliases.get_epilog("--cut-smarts", smarts_aliases.cut_smarts_aliases)
@@ -196,10 +196,7 @@ def fragment_command(parser, args):
 p.add_argument("--has-header", default=False, action="store_true",
                help="skip the first line, which is the header line")
 p.add_argument("--output", "-o", metavar="FILENAME",
-               help="save the fragment data to FILENAME (default=stdout)")
-p.add_argument("--out", metavar="FORMAT", choices=("fragments", "fragments.gz", "fraginfo", "fraginfo.gz"),
-               help="output format. One of 'fragments' or 'fragments.gz'. "
-               "If not present, guess from the filename, and default to 'fragments'")
+               help="save the fragment data to FILENAME (default: based on the structure filename)")
 p.add_argument("structure_filename", nargs="?", default=None,
                help="SMILES filename (default: read from stdin)")
 
@@ -503,6 +500,28 @@ def list_command(parser, args):
                subparser=p)
 
 
+#### mmpdb smi_split
+
+# mmpdb smi_split blah.smi.gz --num-records 1000 --template "{basename}.{i}.smi"
+# mmpdb smi_split blah.smi.gz --num-files 5 --template "{basename}.{i}.smi"
+
+
+#### mmpdb fragdb_stats
+
+
+#### mmpdb fragdb_merge
+
+# mmpdb fragdb_merge blah.*.fragdb -o blah.fragdb
+
+#### mmpdb fragdb_split
+
+# mmpdb fragdb_split blah.fragdb --template "{basename}.{i}.fragdb" --jobs blah.txt
+#   mmpdb index {basename}.{i}.fragdb --no-H '{basename}.{i}.fragdb'
+#   mmpdb index {basename}.fragdb --H -o '{basename}.H.fragdb'
+
+# mmpdb mmpdb_join blah.*.fragdb -o blah.fragdb
+
+
 #### mmpdb loadprops
 
 p = loadprops_parser = subparsers.add_parser(

diff --git a/mmpdblib/config.py b/mmpdblib/config.py
@@ -39,6 +39,7 @@
 
 from ._compat import basestring
 from . import smarts_aliases
+from . import fragment_types
 
 # Things to pass as the ArgumentParser argument's 'type'
 
@@ -137,59 +138,7 @@ def parse_method_value(value):
     return value
 
 
-class FragmentOptions(object):
-    def __init__(self, max_heavies, max_rotatable_bonds,
-                 rotatable_smarts, cut_smarts, num_cuts,
-                 method, salt_remover, min_heavies_per_const_frag):
-        assert isinstance(max_heavies, int) or max_heavies is None, max_heavies
-        self.max_heavies = max_heavies
-
-        assert isinstance(max_rotatable_bonds, int) or max_rotatable_bonds is None, max_rotatable_bonds
-        self.max_rotatable_bonds = max_rotatable_bonds
-
-        assert isinstance(rotatable_smarts, str), rotatable_smarts
-        self.rotatable_smarts = rotatable_smarts
-
-        assert isinstance(cut_smarts, str), cut_smarts
-        self.cut_smarts = cut_smarts
-
-        assert num_cuts in (1, 2, 3), num_cuts
-        self.num_cuts = num_cuts
-
-        assert method in ("chiral",)
-        self.method = method
-
-        assert isinstance(salt_remover, basestring), salt_remover
-        self.salt_remover = salt_remover
-
-        assert isinstance(min_heavies_per_const_frag, int), min_heavies_per_const_frag
-        self.min_heavies_per_const_frag = min_heavies_per_const_frag
-
-    def to_dict(self):
-        d = OrderedDict()
-        for name in ("max_heavies", "max_rotatable_bonds", "rotatable_smarts",
-                     "cut_smarts", "num_cuts", "method", "salt_remover",
-                     "min_heavies_per_const_frag"):
-            d[name] = getattr(self, name)
-        return d
-
-    def to_text_settings(self):
-        def _none(x):
-            return "none" if x is None else str(x)
-        return (
-            ("max_heavies", _none(self.max_heavies)),
-            ("max_rotatable_bonds", _none(self.max_rotatable_bonds)),
-            ("rotatable_smarts", self.rotatable_smarts),
-            ("cut_smarts", self.cut_smarts),
-            ("num_cuts", str(self.num_cuts)),
-            ("method", self.method),
-            ("salt_remover", self.salt_remover),
-            ("min_heavies_per_const_frag", str(self.min_heavies_per_const_frag))
-
-            )
-
-
-DEFAULT_FRAGMENT_OPTIONS = FragmentOptions(
+DEFAULT_FRAGMENT_OPTIONS = fragment_types.FragmentOptions(
     max_heavies=100,
     max_rotatable_bonds=10,
     rotatable_smarts="[!$([NH]!@C(=O))&!D1&!$(*#*)]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)]",
@@ -243,10 +192,6 @@ def add_fragment_arguments(parser):
                    help="Ignore fragmentations where one or more constant fragments are very small (default: %r)"
                          % (OPTS.min_heavies_per_const_frag,))
 
-    ## p.add_argument("--method", choices=("dalke", "hussain"), type=fragment_io.parse_method_value,
-    ##                help="fragment canonicalization method to use (default: %s)"
-    ##                % (FRAGMENT_OPTIONS.method,))
-
 ###### Index
 
 parse_min_variable_heavies_value = nonnegative_int

diff --git a/mmpdblib/do_fragment.py b/mmpdblib/do_fragment.py
@@ -46,7 +46,7 @@
 from . import reporters
 from . import fileio
 from . import fragment_algorithm
-from . import fragment_io
+from . import fragment_db
 from . import fragment_types
 from . import smarts_aliases
 
@@ -175,7 +175,7 @@ def get(name):
     if min_heavies_per_const_frag == "none":
         min_heavies_per_const_frag = 0
 
-    return specified_args, config.FragmentOptions(
+    return specified_args, fragment_types.FragmentOptions(
         max_heavies = max_heavies,
         max_rotatable_bonds = max_rotatable_bonds,
         rotatable_smarts =rotatable_smarts,
@@ -535,6 +535,12 @@ def fragment_command(parser, args):
     except ValueError as err:
         sys.stderr.write(str(err) + "\n")
         raise SystemExit(1)
+
+    structure_filename = args.structure_filename
+    output_filename = args.output
+    if output_filename is None:
+        output_filename = fileio.remove_suffixes(structure_filename) + ".fragdb"
+        reporter.report(f"Using {output_filename!r} as the default --output file.")
 
     # Use a cache?
     cache = None
@@ -546,7 +552,7 @@ def fragment_command(parser, args):
             raise AssertionError("Should not get here")
 
         try:
-            cache = fragment_io.load_cache(args.cache, reporter)
+            cache = fragment_db.load_cache(args.cache, reporter)
         except IOError as err:
             parser.error("Cannot open cache: %s" % (err,))
         except ValueError as err:
@@ -573,9 +579,9 @@ def fragment_command(parser, args):
         try:
             with fileio.read_smiles_file(args.structure_filename, args.format,
                                          args.delimiter, args.has_header) as reader:
-                with fragment_io.open_fragment_writer(filename = args.output,
+                with fragment_db.open_fragment_writer(output_filename,
                                                       options = fragment_filter.options,
-                                                      format_hint = args.out) as writer:
+                                                      ) as writer:
                     records = make_fragment_records(reader, fragment_filter, cache,
                                                     pool=pool, reporter=reporter)
                     writer.write_records(records)
@@ -597,7 +603,7 @@ def fragment_command(parser, args):
         reporter.update("")
 
         ## with fileio.open_output(args.output, args.out) as outfile:
-        ##     fragment_io.write_fragment_records(outfile, records, )
+        ##     fragment_db.write_fragment_records(outfile, records, )
 
 def smifrag_command(parser, args):
     reporter = command_support.get_reporter(args.quiet)
@@ -611,9 +617,6 @@ def smifrag_command(parser, args):
     if record.errmsg:
         parser.error("Cannot parse --smiles: %s" % (record.errmsg,))
 
-    #writer = fragment_io.FragInfoWriter(None, sys.stdout, None)
-    #writer.write_records([record])
-
     columns = [["#cuts"], ["enum.label"],
                ["#heavies"], ["symm.class"], ["smiles"],
                ["order"],
@@ -624,7 +627,7 @@ def smifrag_command(parser, args):
               "right", "center", "left", "left"]
 
     has_rows = False
-    for frag in record.fragments:
+    for frag in record.fragmentations:
         has_rows = True
         items = [str(frag.num_cuts),
                  frag.enumeration_label,

diff --git a/mmpdblib/do_index.py b/mmpdblib/do_index.py
@@ -37,7 +37,7 @@
 
 from . import command_support
 from . import index_algorithm
-from . import fragment_io
+from . import fragment_db
 from . import fragment_types
 from . import properties_io
 from ._compat import open_universal
@@ -151,23 +151,24 @@ def index_command(parser, args):
         selected_ids = set(properties.get_ids())
     end_properties_memory = get_memory_use()
 
+    fragment_filename = args.fragment_filename
+    if fragment_filename is None:
+        # Not specified so use the default name.
+        # XXX warning message?
+        fragment_filename = "input.fragdb"
+
     if (args.out is None or args.out == "mmpdb") and args.output is None:
         # Use the filename based on the fragments filename
-        fragment_filename = args.fragment_filename
-        if fragment_filename is None:
-            parser.error("The '--out mmpdb' format requires a filename when reading from stdin.")
 
         # replace the extension (if any) with ".mmpdb"
         args.output = os.path.splitext(fragment_filename)[0] + ".mmpdb"
         reporter.warning("No --output filename specified. Saving to %r." % (args.output,))
 
     #reporter.report("Using fragment filters: %s" % (fragment_filter.get_args(),))
 
-    fragment_io.suggest_faster_json(reporter)
-
     start_fragment_index_memory = get_memory_use()
     try:
-        fragment_reader = fragment_io.read_fragment_records(args.fragment_filename)
+        fragment_reader = fragment_db.read_fragment_records(fragment_filename)
     except fragment_types.FragmentFormatError as err:
         parser.error(str(err))
 

diff --git a/mmpdblib/fileio.py b/mmpdblib/fileio.py
@@ -28,7 +28,7 @@
 
 import sys
 import gzip
-#import io
+import os
 
 from ._compat import basestring, open_universal, io_wrapper
 from ._compat import binary_stdin, binary_stdout
@@ -469,7 +469,20 @@ def _read_to_eol(infile, close, first_lineno, location):
     "native": _read_whitespace,
     }
 
+def remove_suffixes(filename):
+    """Remove .gz (if present), then one more suffix (if present).
 
+    Used to get the '/a/x' in '/a/x.smi.gz' or '/a/x.smi'.
+    """
+    left, ext = os.path.splitext(filename)
+    if ext.lower() == ".gz":
+        left, ext = os.path.splitext(left)
+    if not left:
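+        # Intended for odd names like ".smi", ".gz", and ".smi.gz". NOTE(review): os.path.splitext keeps a leading dot in the root, so confirm this branch fires for them.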
+        return "input"
+    return left
+
+
 def read_smiles_file(filename, format=None, delimiter="whitespace", has_header=False):
     if format is None:
         if filename is not None and filename.lower().endswith(".gz"):

diff --git a/mmpdblib/fragment_algorithm.py b/mmpdblib/fragment_algorithm.py
@@ -37,6 +37,7 @@
 from rdkit import Chem
 import itertools
 from . import smiles_syntax  # for validation
+from .fragment_types import Fragmentation
 
 #####
 
@@ -47,39 +48,6 @@ class EnumerationLabel(object):
     VARIABLE_UP_ENUMERATION = "V"
 
 
-class Fragmentation(object):
-    __slots__ = ("num_cuts", "enumeration_label",
-                 "variable_num_heavies", "variable_symmetry_class", "variable_smiles",
-                 "attachment_order",
-                 "constant_num_heavies", "constant_symmetry_class", "constant_smiles", "constant_with_H_smiles")
-    def __init__(self,
-                 num_cuts, enumeration_label,
-                 variable_num_heavies, variable_symmetry_class, variable_smiles,
-                 attachment_order,
-                 constant_num_heavies, constant_symmetry_class, constant_smiles, constant_with_H_smiles):
-        self.num_cuts = num_cuts
-        self.enumeration_label = enumeration_label
-        self.variable_num_heavies = variable_num_heavies
-        self.variable_symmetry_class = variable_symmetry_class
-        self.variable_smiles = variable_smiles
-        self.attachment_order = attachment_order
-        self.constant_num_heavies = constant_num_heavies
-        self.constant_symmetry_class = constant_symmetry_class
-        self.constant_smiles = constant_smiles
-        self.constant_with_H_smiles = constant_with_H_smiles
-
-    def __repr__(self):
-        return ("Fragmentation({self.num_cuts}, {self.enumeration_label!r}, "
-                "{self.variable_num_heavies}, {self.variable_symmetry_class!r}, {self.variable_smiles!r}, "
-                "{self.attachment_order!r}, "
-                "{self.constant_num_heavies}, {self.constant_symmetry_class!r}, "
-                "{self.constant_smiles!r}, {self.constant_with_H_smiles!r})").format(
-                    self=self)
-
-    def get_unique_key(self):
-        return "%s.%s.%s" % (self.attachment_order, self.variable_smiles, self.constant_smiles)
-
-
 #####
 
 # TODO: Move some of these into smiles_syntax.py