Skip to content

Commit

Permalink
Several small fixes and improvements
Browse files Browse the repository at this point in the history
to code and documentation.
  • Loading branch information
hmcezar committed Sep 21, 2023
1 parent 12caf0c commit 12a2fad
Show file tree
Hide file tree
Showing 11 changed files with 82 additions and 51 deletions.
16 changes: 8 additions & 8 deletions clusttraj/classify.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Functions to perform clustering based on the distance matrix."""

import scipy.cluster.hierarchy as hcl
from scipy.spatial.distance import squareform
from sklearn import metrics
Expand All @@ -8,19 +10,17 @@

def classify_structures_silhouette(
clust_opt: ClustOptions, distmat: np.ndarray, dstep: float = 0.1
) -> Tuple[np.float64, np.ndarray, np.ndarray]:
"""
Find the optimal threshold following the silhouette score metric and perform the classification.
) -> Tuple[np.ndarray, np.ndarray]:
"""Find the optimal threshold following the silhouette score metric and
perform the classification.
Args:
X (np.ndarray): Array with initial data (distmat matrix)
clust_opt: The clustering options.
distmat: The distance matrix.
dstep (float, optional): Interval between threshold values, defaults to 0.1
Returns:
Tuple[np.float64, np.ndarray, np.ndarray]:
- The optimal silhouette score value
- An array with degenerated threshold values that yield the same optimal score
- An array with the cluster's labels from each optimal score
A tuple containing the linkage matrix and the clusters.
"""

# linkage
Expand Down
3 changes: 3 additions & 0 deletions clusttraj/distmat.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
"""Functions to compute the distance matrix based on the provided
trajectory."""

from openbabel import pybel
import numpy as np
import rmsd
Expand Down
30 changes: 23 additions & 7 deletions clusttraj/io.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
"""Input parsing, output information and a class to store the options for
clustering."""

from openbabel import pybel
from openbabel import openbabel
from scipy.spatial.distance import squareform
Expand All @@ -20,6 +23,8 @@

@dataclass
class ClustOptions:
"""Class to store the options for clustering."""

trajfile: str = None
min_rmsd: float = None
n_workers: int = None
Expand Down Expand Up @@ -50,12 +55,20 @@ class ClustOptions:
reorder_excl: np.ndarray = None
optimal_cut: np.ndarray = None

def update(self, new):
def update(self, new: dict) -> None:
"""Update the instance with new values.
Args:
new (dict): A dictionary containing the new values to update.
Returns:
None
"""
for key, value in new.items():
if hasattr(self, key):
setattr(self, key, value)

def __str__(self):
def __str__(self) -> str:
"""Return a string representation of the ClustOptions object.
Returns:
Expand All @@ -67,9 +80,10 @@ def __str__(self):
return_str += f"\n\nClusterized from trajectory file: {self.trajfile}\n"
return_str += f"Method: {self.method}\n"
if self.silhouette_score:
return_str += "\n Using "
return_str += "\nUsing silhouette score\n"
return_str += f"RMSD criterion found by silhouette: {self.optimal_cut[0]}\n"
else:
return_str += "RMSD criterion: {self.min_rmsd}\n"
return_str += f"RMSD criterion: {self.min_rmsd}\n"
return_str += f"Ignoring hydrogens?: {self.no_hydrogen}\n"

# reordering options
Expand Down Expand Up @@ -110,6 +124,8 @@ def __str__(self):


class Logger:
"""Logger class."""

logformat = "%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] <%(funcName)s> %(message)s"
formatter = logging.Formatter(fmt=logformat)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -160,11 +176,11 @@ def extant_file(x: str) -> str:
Args:
x (str): The file path to check.
Returns:
str: The input file path if it exists.
Raises:
argparse.ArgumentTypeError: If the file does not exist.
Returns:
str: The input file path if it exists.
"""
if not os.path.exists(x):
raise argparse.ArgumentTypeError(f"{x} does not exist")
Expand Down
33 changes: 7 additions & 26 deletions clusttraj/main.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,8 @@
"""This script takes a trajectory and based on a minimal RMSD classify the
structures in clusters.
"""Main entry point for clusttraj.
The RMSD implementation using the Kabsch algorithm to superpose the molecules is taken from: https://github.com/charnley/rmsd
A very good description of the problem of superposition can be found at http://cnx.org/contents/HV-RsdwL@23/Molecular-Distance-Measures
A very good tutorial on hierachical clustering with scipy can be seen at https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
This script performs agglomerative clustering as suggested in https://stackoverflow.com/questions/31085393/hierarchical-clustering-a-pairwise-distance-matrix-of-precomputed-distances
Author: Henrique Musseli Cezar
Date: NOV/2017
TODO:
- [x] split this file into files (compute distance, cluster, plot, etc..)
- [x] add unit tests for the routines
- [ ] support coverage
- [x] check why clusttraj is not being made available when I pip install
- [ ] create conda package
- [x] update readme (also include installation instructions)
- [ ] upload package
""" # noqa: E501
Can be called from command line or from an external library given a list
of arguments.
"""

import sys
import numpy as np
Expand All @@ -28,7 +12,7 @@
from .classify import classify_structures, classify_structures_silhouette


def main(args=None) -> None:
def main(args: list = None) -> None:
"""Main function that performs clustering and generates output.
Args:
Expand All @@ -48,7 +32,7 @@ def main(args=None) -> None:

# perform the clustering
if clust_opt.silhouette_score:
Z, clusters, t_opt = classify_structures_silhouette(clust_opt, distmat)
Z, clusters = classify_structures_silhouette(clust_opt, distmat)
else:
Z, clusters = classify_structures(clust_opt, distmat)

Expand Down Expand Up @@ -77,10 +61,7 @@ def main(args=None) -> None:
if clust_opt.plot:
plot_clust_evo(clust_opt, clusters)

if clust_opt.silhouette_score:
plot_dendrogram(clust_opt, Z, t_opt)
else:
plot_dendrogram(clust_opt, Z)
plot_dendrogram(clust_opt, Z)

plot_mds(clust_opt, clusters, distmat)

Expand Down
2 changes: 2 additions & 0 deletions clusttraj/plot.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Functions to plot the obtained results."""

from sklearn import manifold
from scipy.spatial.distance import squareform
import scipy.cluster.hierarchy as hcl
Expand Down
2 changes: 2 additions & 0 deletions clusttraj/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Additional utility functions."""

from openbabel import pybel
import numpy as np
from typing import Tuple
Expand Down
11 changes: 9 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,22 @@
import os
import sys

sys.path.insert(0, os.path.abspath("../../clusttraj/"))
sys.path.insert(0, os.path.abspath("../.."))

def get_release(init_path="../../clusttraj/__init__.py"):
    """Return the version string assigned to ``__version__`` in a Python file.

    The file is scanned line by line and the value of the first real
    ``__version__ = ...`` assignment is returned. Lines that merely mention
    ``__version__`` (docstrings, comments, ``__all__`` entries) are ignored
    instead of being parsed as the assignment.

    Args:
        init_path (str, optional): Path of the file to scan. Defaults to the
            package ``__init__.py`` relative to the current working directory.

    Returns:
        The version with surrounding quotes and any trailing comment removed,
        or None when the file contains no ``__version__`` assignment.
    """
    with open(init_path, "r", encoding="utf-8") as f:
        for line in f:
            target, sep, value = line.partition("=")
            if not sep:
                # No assignment on this line; nothing to parse.
                continue
            # Tolerate an annotated assignment such as `__version__: str = "x"`.
            if target.split(":", 1)[0].strip() != "__version__":
                continue
            # Drop a trailing comment, then whitespace and either quote style.
            value = value.split("#", 1)[0].strip()
            return value.strip("\"'")
    return None

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "ClustTraj"
copyright = "2023, Henrique Musseli Cezar"
author = "Henrique Musseli Cezar"
release = "1.0.0"
version = get_release()
release = get_release()

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
25 changes: 24 additions & 1 deletion docs/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@ Installing ClustTraj

Installing ``clusttraj`` is simple and can be achieved using ``pip``:

.. code-block::bash
.. code-block:: console
pip install clusttraj
This will automatically install the package and its dependencies.

We recommend performing the installation in an empty virtual environment.

Dependencies
************

Expand All @@ -22,3 +25,23 @@ Currently, the following dependencies are installed:
* `SciPy <https://www.scipy.org/>`_
* `scikit-learn <http://scikit-learn.org/stable/index.html>`_
* `matplotlib <https://matplotlib.org/>`_

Since the ``qml`` project development has been slow, we provide a fork repository in which
we updated the package to be installable in modern environments with newer versions of
Python and libraries.
This modified version can be downloaded and installed from `this link <https://github.com/hmcezar/qml>`_.

Installation Problems
*********************

If you have problems installing ``clusttraj`` because installing ``qml`` fails, try installing
``qml`` yourself first.
For Python 3.11, you might have to either disable the distutils bundled with setuptools by setting the environment
variable ``SETUPTOOLS_USE_DISTUTILS=stdlib`` before installing, or downgrade ``setuptools``
to a version earlier than 60.0.
For example, you could install ``qml`` with:

.. code-block:: console
pip install "setuptools<60"
pip install "qml @ git+https://github.com/hmcezar/qml@develop"
7 changes: 0 additions & 7 deletions docs/source/modules.rst

This file was deleted.

1 change: 1 addition & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def options_dict(tmp_path):
"solute_natoms": 17,
"overwrite": True,
"final_kabsch": False,
"silhouette_score": False,
}

return options_dict
Expand Down
3 changes: 3 additions & 0 deletions test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_ClustOptions(options_dict):
assert clust_opt.solute_natoms == 17
assert clust_opt.overwrite is True
assert clust_opt.final_kabsch is False
assert clust_opt.silhouette_score is False


def test_check_positive():
Expand Down Expand Up @@ -80,6 +81,7 @@ def test_parse_args():
optimal_ordering=True,
force=True,
final_kabsch=True,
silhouette_score=False,
)
clust_opt = parse_args(args)

Expand All @@ -104,6 +106,7 @@ def test_parse_args():
optimal_ordering=True,
force=True,
final_kabsch=True,
silhouette_score=False,
)
clust_opt = parse_args(args)

Expand Down

0 comments on commit 12a2fad

Please sign in to comment.