From 12a2fad8ded44cb460aef2c68be14ff65ee06887 Mon Sep 17 00:00:00 2001 From: Henrique Musseli Cezar Date: Thu, 21 Sep 2023 14:50:53 +0200 Subject: [PATCH] Several small fixes and improvements to code and documentation. --- clusttraj/classify.py | 16 ++++++++-------- clusttraj/distmat.py | 3 +++ clusttraj/io.py | 30 +++++++++++++++++++++++------- clusttraj/main.py | 33 +++++++-------------------------- clusttraj/plot.py | 2 ++ clusttraj/utils.py | 2 ++ docs/source/conf.py | 11 +++++++++-- docs/source/install.rst | 25 ++++++++++++++++++++++++- docs/source/modules.rst | 7 ------- test/conftest.py | 1 + test/test_io.py | 3 +++ 11 files changed, 82 insertions(+), 51 deletions(-) delete mode 100644 docs/source/modules.rst diff --git a/clusttraj/classify.py b/clusttraj/classify.py index 5c3bace..e97434d 100644 --- a/clusttraj/classify.py +++ b/clusttraj/classify.py @@ -1,3 +1,5 @@ +"""Functions to perform clustering based on the distance matrix.""" + import scipy.cluster.hierarchy as hcl from scipy.spatial.distance import squareform from sklearn import metrics @@ -8,19 +10,17 @@ def classify_structures_silhouette( clust_opt: ClustOptions, distmat: np.ndarray, dstep: float = 0.1 -) -> Tuple[np.float64, np.ndarray, np.ndarray]: - """ - Find the optimal threshold following the silhouette score metric and perform the classification. +) -> Tuple[np.ndarray, np.ndarray]: + """Find the optimal threshold following the silhouette score metric and + perform the classification. Args: - X (np.ndarray): Array with initial data (distmat matrix) + clust_opt: The clustering options. + distmat: The distance matrix. dstep (float, optional): Interval between threshold values, defaults to 0.1 Returns: - Tuple[np.float64, np.ndarray, np.ndarray]: - - The optimal silhouette score value - - An array with degenerated threshold values that yield the same optimal score - - An array with the cluster's labels from each optimal score + A tuple containing the linkage matrix and the clusters. 
""" # linkage diff --git a/clusttraj/distmat.py b/clusttraj/distmat.py index 8f0297b..6b4e633 100644 --- a/clusttraj/distmat.py +++ b/clusttraj/distmat.py @@ -1,3 +1,6 @@ +"""Functions to compute the distance matrix based on the provided +trajectory.""" + from openbabel import pybel import numpy as np import rmsd diff --git a/clusttraj/io.py b/clusttraj/io.py index 2a51893..bcb2e48 100644 --- a/clusttraj/io.py +++ b/clusttraj/io.py @@ -1,3 +1,6 @@ +"""Input parsing, output information and a class to store the options for +clustering.""" + from openbabel import pybel from openbabel import openbabel from scipy.spatial.distance import squareform @@ -20,6 +23,8 @@ @dataclass class ClustOptions: + """Class to store the options for clustering.""" + trajfile: str = None min_rmsd: float = None n_workers: int = None @@ -50,12 +55,20 @@ class ClustOptions: reorder_excl: np.ndarray = None optimal_cut: np.ndarray = None - def update(self, new): + def update(self, new: dict) -> None: + """Update the instance with new values. + + Args: + new (dict): A dictionary containing the new values to update. + + Returns: + None + """ for key, value in new.items(): if hasattr(self, key): setattr(self, key, value) - def __str__(self): + def __str__(self) -> str: """Return a string representation of the ClustOptions object. 
Returns: @@ -67,9 +80,10 @@ def __str__(self): return_str += f"\n\nClusterized from trajectory file: {self.trajfile}\n" return_str += f"Method: {self.method}\n" if self.silhouette_score: - return_str += "\n Using " + return_str += "\nUsing silhouette score\n" + return_str += f"RMSD criterion found by silhouette: {self.optimal_cut[0]}\n" else: - return_str += "RMSD criterion: {self.min_rmsd}\n" + return_str += f"RMSD criterion: {self.min_rmsd}\n" return_str += f"Ignoring hydrogens?: {self.no_hydrogen}\n" # reordering options @@ -110,6 +124,8 @@ def __str__(self): class Logger: + """Logger class.""" + logformat = "%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] <%(funcName)s> %(message)s" formatter = logging.Formatter(fmt=logformat) logger = logging.getLogger(__name__) @@ -160,11 +176,11 @@ def extant_file(x: str) -> str: Args: x (str): The file path to check. - Returns: - str: The input file path if it exists. - Raises: argparse.ArgumentTypeError: If the file does not exist. + + Returns: + str: The input file path if it exists. """ if not os.path.exists(x): raise argparse.ArgumentTypeError(f"{x} does not exist") diff --git a/clusttraj/main.py b/clusttraj/main.py index 62612d8..e524522 100644 --- a/clusttraj/main.py +++ b/clusttraj/main.py @@ -1,24 +1,8 @@ -"""This script takes a trajectory and based on a minimal RMSD classify the -structures in clusters. +"""Main entry point for clusttraj. 
-The RMSD implementation using the Kabsch algorithm to superpose the molecules is taken from: https://github.com/charnley/rmsd -A very good description of the problem of superposition can be found at http://cnx.org/contents/HV-RsdwL@23/Molecular-Distance-Measures -A very good tutorial on hierachical clustering with scipy can be seen at https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/ -This script performs agglomerative clustering as suggested in https://stackoverflow.com/questions/31085393/hierarchical-clustering-a-pairwise-distance-matrix-of-precomputed-distances - -Author: Henrique Musseli Cezar -Date: NOV/2017 - - -TODO: - - [x] split this file into files (compute distance, cluster, plot, etc..) - - [x] add unit tests for the routines - - [ ] support coverage - - [x] check why clusttraj is not being made available when I pip install - - [ ] create conda package - - [x] update readme (also include installation instructions) - - [ ] upload package -""" # noqa: E501 +Can be called from command line or from an external library given a list +of arguments. +""" import sys import numpy as np @@ -28,7 +12,7 @@ from .classify import classify_structures, classify_structures_silhouette -def main(args=None) -> None: +def main(args: list = None) -> None: """Main function that performs clustering and generates output. 
Args: @@ -48,7 +32,7 @@ def main(args=None) -> None: # perform the clustering if clust_opt.silhouette_score: - Z, clusters, t_opt = classify_structures_silhouette(clust_opt, distmat) + Z, clusters = classify_structures_silhouette(clust_opt, distmat) else: Z, clusters = classify_structures(clust_opt, distmat) @@ -77,10 +61,7 @@ def main(args=None) -> None: if clust_opt.plot: plot_clust_evo(clust_opt, clusters) - if clust_opt.silhouette_score: - plot_dendrogram(clust_opt, Z, t_opt) - else: - plot_dendrogram(clust_opt, Z) + plot_dendrogram(clust_opt, Z) plot_mds(clust_opt, clusters, distmat) diff --git a/clusttraj/plot.py b/clusttraj/plot.py index 0262723..d8a8a05 100644 --- a/clusttraj/plot.py +++ b/clusttraj/plot.py @@ -1,3 +1,5 @@ +"""Functions to plot the obtained results.""" + from sklearn import manifold from scipy.spatial.distance import squareform import scipy.cluster.hierarchy as hcl diff --git a/clusttraj/utils.py b/clusttraj/utils.py index 3881f72..6b2d011 100644 --- a/clusttraj/utils.py +++ b/clusttraj/utils.py @@ -1,3 +1,5 @@ +"""Additional utility functions.""" + from openbabel import pybel import numpy as np from typing import Tuple diff --git a/docs/source/conf.py b/docs/source/conf.py index a700154..227fb62 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,7 +5,13 @@ import os import sys -sys.path.insert(0, os.path.abspath("../../clusttraj/")) +sys.path.insert(0, os.path.abspath("../..")) + +def get_release(): + with open("../../clusttraj/__init__.py", "r") as f: + for line in f: + if "__version__" in line: + return line.split("=")[1].strip().strip('"') # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information @@ -13,7 +19,8 @@ project = "ClustTraj" copyright = "2023, Henrique Musseli Cezar" author = "Henrique Musseli Cezar" -release = "1.0.0" +version = get_release() +release = get_release() # -- General configuration 
--------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/install.rst b/docs/source/install.rst index 9db41a0..ffe7153 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -3,11 +3,14 @@ Installing ClustTraj Installing ``clusttraj`` is simple and can be achieved using ``pip``: -.. code-block::bash +.. code-block:: console + pip install clusttraj This will automatically install the package and its dependencies. +We recommend performing the installation in an empty virtual environment. + Dependencies ************ @@ -22,3 +25,23 @@ Currently, the following dependencies are installed: * `SciPy `_ * `scikit-learn `_ * `matplotlib `_ + +Since the ``qml`` project development has been slow, we provide a fork repository in which +we updated the package to be installable in modern environments with newer versions of +Python and libraries. +This modified version can be downloaded and installed from `this link `_. + +Installation Problems +********************* + +If you have problems installing ``clusttraj`` because installing ``qml`` fails, try installing +``qml`` yourself first. +For Python 3.11, you might have to either disable setuptools distutils by setting the environment +variable ``SETUPTOOLS_USE_DISTUTILS=stdlib`` before installing, or downgrade ``setuptools`` +to a version prior to 60.0. +For example, you could install ``qml`` with: + +.. code-block:: console + + pip install "setuptools<60" + pip install "qml @ git+https://github.com/hmcezar/qml@develop" \ No newline at end of file diff --git a/docs/source/modules.rst b/docs/source/modules.rst deleted file mode 100644 index 273238c..0000000 --- a/docs/source/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -clusttraj -========= - -.. 
toctree:: - :maxdepth: 4 - - clusttraj diff --git a/test/conftest.py b/test/conftest.py index 452dffa..99a7ad9 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -50,6 +50,7 @@ def options_dict(tmp_path): "solute_natoms": 17, "overwrite": True, "final_kabsch": False, + "silhouette_score": False, } return options_dict diff --git a/test/test_io.py b/test/test_io.py index 166a989..49187d7 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -42,6 +42,7 @@ def test_ClustOptions(options_dict): assert clust_opt.solute_natoms == 17 assert clust_opt.overwrite is True assert clust_opt.final_kabsch is False + assert clust_opt.silhouette_score is False def test_check_positive(): @@ -80,6 +81,7 @@ def test_parse_args(): optimal_ordering=True, force=True, final_kabsch=True, + silhouette_score=False, ) clust_opt = parse_args(args) @@ -104,6 +106,7 @@ def test_parse_args(): optimal_ordering=True, force=True, final_kabsch=True, + silhouette_score=False, ) clust_opt = parse_args(args)