Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Fragdb proposal #39

Merged
merged 14 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 29 additions & 10 deletions mmpdblib/commandline.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,18 +158,18 @@ def add_in_memory(parser):

2) Read from a gzip-compressed tab-delimited SMILES file. Use 8
threads to fragment the structures. Save the results to
dataset.fragments.gz .
dataset.fragdb

% mmpa fragment --delimiter tab dataset.smi.gz --num-jobs 8 \\
-o dataset.fragments.gz
-o dataset.fragdb

3) Fragment the SMILES in 'dataset.smi.gz'. Reuse fragment information
from the cache file 'old_dataset.fragments.gz' if possible, instead of
from the cache file 'old_dataset.fragdb' if possible, instead of
computing the fragments from scratch each time. Save the results to
'new_dataset.fragments.gz'.
'new_dataset.fragdb'.

% mmpa fragment --cache old_dataset.fragments.gz dataset.smi.gz \\
-o new_dataset.fragments.gz
% mmpa fragment --cache old_dataset.fragdb dataset.smi.gz \\
-o new_dataset.fragdb


""" + smarts_aliases.get_epilog("--cut-smarts", smarts_aliases.cut_smarts_aliases)
Expand All @@ -196,10 +196,7 @@ def fragment_command(parser, args):
p.add_argument("--has-header", default=False, action="store_true",
help="skip the first line, which is the header line")
p.add_argument("--output", "-o", metavar="FILENAME",
help="save the fragment data to FILENAME (default=stdout)")
p.add_argument("--out", metavar="FORMAT", choices=("fragments", "fragments.gz", "fraginfo", "fraginfo.gz"),
help="output format. One of 'fragments' or 'fragments.gz'. "
"If not present, guess from the filename, and default to 'fragments'")
help="save the fragment data to FILENAME (default: based on the structure filename)")
p.add_argument("structure_filename", nargs="?", default=None,
help="SMILES filename (default: read from stdin)")

Expand Down Expand Up @@ -503,6 +500,28 @@ def list_command(parser, args):
subparser=p)


#### mmpdb smi_split

# mmpdb smi_split blah.smi.gz --num-records 1000 --template "{basename}.{i}.smi"
# mmpdb smi_split blah.smi.gz --num-files 5 --template "{basename}.{i}.smi"


#### mmpdb fragdb_stats


#### mmpdb fragdb_merge

# mmpdb fragdb_merge blah.*.fragdb -o blah.fragdb

#### mmpdb fragdb_split

# mmpdb fragdb_split blah.fragdb --template "{basename}.{i}.fragdb" --jobs blah.txt
# mmpdb index {basename}.{i}.fragdb --no-H '{basename}.{i}.fragdb'
# mmpdb index {basename}.fragdb --H -o '{basename}.H.fragdb'

# mmpdb mmpdb_join blah.*.fragdb -o blah.fragdb


#### mmpdb loadprops

p = loadprops_parser = subparsers.add_parser(
Expand Down
59 changes: 2 additions & 57 deletions mmpdblib/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

from ._compat import basestring
from . import smarts_aliases
from . import fragment_types

# Things to pass as the ArgumentParser argument's 'type'

Expand Down Expand Up @@ -137,59 +138,7 @@ def parse_method_value(value):
return value


class FragmentOptions(object):
    """Hold the settings used when fragmenting structures.

    The constructor checks every value with an assertion and stores it
    under an attribute with the same name as the parameter.
    """

    def __init__(self, max_heavies, max_rotatable_bonds,
                 rotatable_smarts, cut_smarts, num_cuts,
                 method, salt_remover, min_heavies_per_const_frag):
        # Check all of the values first, in parameter order, so the
        # first invalid one is the one reported.
        assert max_heavies is None or isinstance(max_heavies, int), max_heavies
        assert max_rotatable_bonds is None or isinstance(max_rotatable_bonds, int), max_rotatable_bonds
        assert isinstance(rotatable_smarts, str), rotatable_smarts
        assert isinstance(cut_smarts, str), cut_smarts
        assert num_cuts in (1, 2, 3), num_cuts
        assert method in ("chiral",)
        assert isinstance(salt_remover, basestring), salt_remover
        assert isinstance(min_heavies_per_const_frag, int), min_heavies_per_const_frag

        # Everything passed; store the settings.
        self.max_heavies = max_heavies
        self.max_rotatable_bonds = max_rotatable_bonds
        self.rotatable_smarts = rotatable_smarts
        self.cut_smarts = cut_smarts
        self.num_cuts = num_cuts
        self.method = method
        self.salt_remover = salt_remover
        self.min_heavies_per_const_frag = min_heavies_per_const_frag

    def to_dict(self):
        """Return the settings as an OrderedDict, in a fixed key order."""
        field_names = (
            "max_heavies", "max_rotatable_bonds", "rotatable_smarts",
            "cut_smarts", "num_cuts", "method", "salt_remover",
            "min_heavies_per_const_frag",
        )
        return OrderedDict((field, getattr(self, field)) for field in field_names)

    def to_text_settings(self):
        """Return the settings as a tuple of (name, text value) pairs.

        The two size limits may be None, which is written as "none".
        """
        def as_text(value):
            if value is None:
                return "none"
            return str(value)

        pairs = [
            ("max_heavies", as_text(self.max_heavies)),
            ("max_rotatable_bonds", as_text(self.max_rotatable_bonds)),
            ("rotatable_smarts", self.rotatable_smarts),
            ("cut_smarts", self.cut_smarts),
            ("num_cuts", str(self.num_cuts)),
            ("method", self.method),
            ("salt_remover", self.salt_remover),
            ("min_heavies_per_const_frag", str(self.min_heavies_per_const_frag)),
        ]
        return tuple(pairs)


DEFAULT_FRAGMENT_OPTIONS = FragmentOptions(
DEFAULT_FRAGMENT_OPTIONS = fragment_types.FragmentOptions(
max_heavies=100,
max_rotatable_bonds=10,
rotatable_smarts="[!$([NH]!@C(=O))&!D1&!$(*#*)]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)]",
Expand Down Expand Up @@ -243,10 +192,6 @@ def add_fragment_arguments(parser):
help="Ignore fragmentations where one or more constant fragments are very small (default: %r)"
% (OPTS.min_heavies_per_const_frag,))

## p.add_argument("--method", choices=("dalke", "hussain"), type=fragment_io.parse_method_value,
## help="fragment canonicalization method to use (default: %s)"
## % (FRAGMENT_OPTIONS.method,))

###### Index

parse_min_variable_heavies_value = nonnegative_int
Expand Down
23 changes: 13 additions & 10 deletions mmpdblib/do_fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
from . import reporters
from . import fileio
from . import fragment_algorithm
from . import fragment_io
from . import fragment_db
from . import fragment_types
from . import smarts_aliases

Expand Down Expand Up @@ -175,7 +175,7 @@ def get(name):
if min_heavies_per_const_frag == "none":
min_heavies_per_const_frag = 0

return specified_args, config.FragmentOptions(
return specified_args, fragment_types.FragmentOptions(
max_heavies = max_heavies,
max_rotatable_bonds = max_rotatable_bonds,
rotatable_smarts =rotatable_smarts,
Expand Down Expand Up @@ -535,6 +535,12 @@ def fragment_command(parser, args):
except ValueError as err:
sys.stderr.write(str(err) + "\n")
raise SystemExit(1)

structure_filename = args.structure_filename
output_filename = args.output
if output_filename is None:
output_filename = fileio.remove_suffixes(structure_filename) + ".fragdb"
reporter.report(f"Using {output_filename!r} as the default --output file.")

# Use a cache?
cache = None
Expand All @@ -546,7 +552,7 @@ def fragment_command(parser, args):
raise AssertionError("Should not get here")

try:
cache = fragment_io.load_cache(args.cache, reporter)
cache = fragment_db.load_cache(args.cache, reporter)
except IOError as err:
parser.error("Cannot open cache: %s" % (err,))
except ValueError as err:
Expand All @@ -573,9 +579,9 @@ def fragment_command(parser, args):
try:
with fileio.read_smiles_file(args.structure_filename, args.format,
args.delimiter, args.has_header) as reader:
with fragment_io.open_fragment_writer(filename = args.output,
with fragment_db.open_fragment_writer(output_filename,
options = fragment_filter.options,
format_hint = args.out) as writer:
) as writer:
records = make_fragment_records(reader, fragment_filter, cache,
pool=pool, reporter=reporter)
writer.write_records(records)
Expand All @@ -597,7 +603,7 @@ def fragment_command(parser, args):
reporter.update("")

## with fileio.open_output(args.output, args.out) as outfile:
## fragment_io.write_fragment_records(outfile, records, )
## fragment_db.write_fragment_records(outfile, records, )

def smifrag_command(parser, args):
reporter = command_support.get_reporter(args.quiet)
Expand All @@ -611,9 +617,6 @@ def smifrag_command(parser, args):
if record.errmsg:
parser.error("Cannot parse --smiles: %s" % (record.errmsg,))

#writer = fragment_io.FragInfoWriter(None, sys.stdout, None)
#writer.write_records([record])

columns = [["#cuts"], ["enum.label"],
["#heavies"], ["symm.class"], ["smiles"],
["order"],
Expand All @@ -624,7 +627,7 @@ def smifrag_command(parser, args):
"right", "center", "left", "left"]

has_rows = False
for frag in record.fragments:
for frag in record.fragmentations:
has_rows = True
items = [str(frag.num_cuts),
frag.enumeration_label,
Expand Down
15 changes: 8 additions & 7 deletions mmpdblib/do_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

from . import command_support
from . import index_algorithm
from . import fragment_io
from . import fragment_db
from . import fragment_types
from . import properties_io
from ._compat import open_universal
Expand Down Expand Up @@ -151,23 +151,24 @@ def index_command(parser, args):
selected_ids = set(properties.get_ids())
end_properties_memory = get_memory_use()

fragment_filename = args.fragment_filename
if fragment_filename is None:
# Not specified so use the default name.
# XXX warning message?
fragment_filename = "input.fragdb"

if (args.out is None or args.out == "mmpdb") and args.output is None:
# Use the filename based on the fragments filename
fragment_filename = args.fragment_filename
if fragment_filename is None:
parser.error("The '--out mmpdb' format requires a filename when reading from stdin.")

# replace the extension (if any) with ".mmpdb"
args.output = os.path.splitext(fragment_filename)[0] + ".mmpdb"
reporter.warning("No --output filename specified. Saving to %r." % (args.output,))

#reporter.report("Using fragment filters: %s" % (fragment_filter.get_args(),))

fragment_io.suggest_faster_json(reporter)

start_fragment_index_memory = get_memory_use()
try:
fragment_reader = fragment_io.read_fragment_records(args.fragment_filename)
fragment_reader = fragment_db.read_fragment_records(fragment_filename)
except fragment_types.FragmentFormatError as err:
parser.error(str(err))

Expand Down
15 changes: 14 additions & 1 deletion mmpdblib/fileio.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

import sys
import gzip
#import io
import os

from ._compat import basestring, open_universal, io_wrapper
from ._compat import binary_stdin, binary_stdout
Expand Down Expand Up @@ -469,7 +469,20 @@ def _read_to_eol(infile, close, first_lineno, location):
"native": _read_whitespace,
}

def remove_suffixes(filename):
    """Remove a trailing ".gz" (if present), then one more suffix.

    Used to get the '/a/x' in '/a/x.smi.gz' or '/a/x.smi', so that a
    default output name can be derived from an input filename.

    The ".gz" test is case-insensitive. Returns "input" if `filename`
    is None (no filename is available, e.g. when reading from stdin)
    or if nothing is left once the suffixes are removed.

    Note: os.path.splitext() does not treat a leading dot as the start
    of a suffix, so a name like ".smi" is returned unchanged and
    ".smi.gz" becomes ".smi".
    """
    if filename is None:
        # Nothing to derive a name from. Use the fallback instead of
        # letting os.path.splitext() fail with a TypeError.
        return "input"

    left, ext = os.path.splitext(filename)
    if ext.lower() == ".gz":
        # Strip the suffix underneath the compression suffix too.
        left = os.path.splitext(left)[0]

    # An empty result (from an empty filename) is not a usable name.
    return left or "input"


def read_smiles_file(filename, format=None, delimiter="whitespace", has_header=False):
if format is None:
if filename is not None and filename.lower().endswith(".gz"):
Expand Down
34 changes: 1 addition & 33 deletions mmpdblib/fragment_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from rdkit import Chem
import itertools
from . import smiles_syntax # for validation
from .fragment_types import Fragmentation

#####

Expand All @@ -47,39 +48,6 @@ class EnumerationLabel(object):
VARIABLE_UP_ENUMERATION = "V"


class Fragmentation(object):
    """One fragmentation of a structure into variable and constant parts.

    Stores the cut count and enumeration label, the variable part
    (heavy-atom count, symmetry class, SMILES), the attachment order,
    and the constant part (heavy-atom count, symmetry class, SMILES,
    and the SMILES with hydrogens).
    """
    __slots__ = (
        "num_cuts", "enumeration_label",
        "variable_num_heavies", "variable_symmetry_class", "variable_smiles",
        "attachment_order",
        "constant_num_heavies", "constant_symmetry_class", "constant_smiles",
        "constant_with_H_smiles",
    )

    def __init__(self,
                 num_cuts, enumeration_label,
                 variable_num_heavies, variable_symmetry_class, variable_smiles,
                 attachment_order,
                 constant_num_heavies, constant_symmetry_class, constant_smiles, constant_with_H_smiles):
        self.num_cuts, self.enumeration_label = num_cuts, enumeration_label

        # The variable part.
        self.variable_num_heavies = variable_num_heavies
        self.variable_symmetry_class = variable_symmetry_class
        self.variable_smiles = variable_smiles

        self.attachment_order = attachment_order

        # The constant part.
        self.constant_num_heavies = constant_num_heavies
        self.constant_symmetry_class = constant_symmetry_class
        self.constant_smiles = constant_smiles
        self.constant_with_H_smiles = constant_with_H_smiles

    def __repr__(self):
        # The counts use their plain formatting; everything else uses repr().
        parts = (
            format(self.num_cuts),
            repr(self.enumeration_label),
            format(self.variable_num_heavies),
            repr(self.variable_symmetry_class),
            repr(self.variable_smiles),
            repr(self.attachment_order),
            format(self.constant_num_heavies),
            repr(self.constant_symmetry_class),
            repr(self.constant_smiles),
            repr(self.constant_with_H_smiles),
        )
        return "Fragmentation(" + ", ".join(parts) + ")"

    def get_unique_key(self):
        """Return "<attachment order>.<variable SMILES>.<constant SMILES>"."""
        key_terms = (self.attachment_order, self.variable_smiles, self.constant_smiles)
        return ".".join(str(term) for term in key_terms)


#####

# TODO: Move some of these into smiles_syntax.py
Expand Down
Loading