Fixes CI errors.

hmcezar · Sep 20, 2023 · 12caf0c · 12caf0c
1 parent 92e8d34
commit 12caf0c
Show file tree

Hide file tree

Showing 6 changed files with 71 additions and 59 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
-# clusstraj - A Python package for clustering similar structures from molecular simulations
-This Python script receives a molecular dynamics or Monte Carlo trajectory (in .pdb, .xyz or any format supported by OpenBabel), finds the minimum RMSD between the structures with the Kabsch algorithm and performs agglomerative clustering (a kind of unsupervised machine learning) to classify similar conformations. 
+# ClustTraj - A Python package for clustering similar structures from molecular simulations
+This Python package receives a molecular dynamics or Monte Carlo trajectory (in .pdb, .xyz or any format supported by OpenBabel), finds the minimum RMSD between the structures with the Kabsch algorithm and performs agglomerative clustering (a kind of unsupervised machine learning) to classify similar conformations. 
 
 What the script does is to calculate the distance (using the minimum RMSD) between each configuration of the trajectory, building a distance matrix (stored in the condensed form).
 Different strategies can be used in order to compute distances that correspond to the expected minimum RMSD, such as atom reordering or stepwise alignments.

diff --git a/clusttraj/__init__.pyc b/clusttraj/__init__.pyc
diff --git a/clusttraj/classify.py b/clusttraj/classify.py
@@ -6,29 +6,29 @@
 from .io import ClustOptions, Logger
 
 
-def silh_score(X, Z, dstep=0.1):
+def classify_structures_silhouette(
+    clust_opt: ClustOptions, distmat: np.ndarray, dstep: float = 0.1
+) -> Tuple[np.float64, np.ndarray, np.ndarray]:
     """
-    Find the optimal threshold following the silhouette score metric.
+    Find the optimal threshold following the silhouette score metric and perform the classification.
 
-    :param X: Array with initial data (distmat matrix)
-    :type X: numpy.ndarray
-
-    :param Z: 'Z' matrix from the scipy.cluster.hierarchy.linkage() method
-    :type Z: numpy.ndarray
-
-    :param dstep: Interval between threshold values, defaults to 0.1
-    :type dstep: float
+    Args:
+        X (np.ndarray): Array with initial data (distmat matrix)
+        dstep (float, optional): Interval between threshold values, defaults to 0.1
 
-    :return: The optimal silhouette score value 
-             :rtype: numpy.float64
+    Returns:
+        Tuple[np.float64, np.ndarray, np.ndarray]:
+            - The optimal silhouette score value
+            - An array with degenerated threshold values that yield the same optimal score
+            - An array with the cluster's labels from each optimal score
+    """
 
-             An array with degenerated threshold values that yield the same optimal score
-             :rtype: numpy.ndarray
+    # linkage
+    Logger.logger.info(
+        f"Clustering using '{clust_opt.method}' method to join the clusters\n"
+    )
+    Z = hcl.linkage(distmat, clust_opt.method, optimal_ordering=clust_opt.opt_order)
 
-             An array with the cluster's labels from each optimal score
-             :rtype: numpy.ndarray
-    """
-
     # Initialize the score and threshold
     ss_opt = np.float64()
     t_opt = np.array([])
@@ -39,10 +39,12 @@ def silh_score(X, Z, dstep=0.1):
 
     for t in t_range:
         # Create an array with cluster labels
-        hcl_labels = hcl.fcluster(Z, t=t, criterion='distance')
-        
+        hcl_labels = hcl.fcluster(Z, t=t, criterion="distance")
+
         # Compute the silhouette score
-        ss = metrics.silhouette_score(X, hcl_labels, metric='precomputed')
+        ss = metrics.silhouette_score(
+            squareform(distmat), hcl_labels, metric="precomputed"
+        )
 
         # Check for degeneracy for the optimal threshold value
         if np.any(ss == ss_opt):
@@ -56,7 +58,27 @@ def silh_score(X, Z, dstep=0.1):
             t_opt = t
             labels_opt = hcl_labels
 
-    return np.round(ss_opt, 3), np.round(t_opt, 3), labels_opt
+    Logger.logger.info(f"Highest silhouette score: {ss_opt}")
+
+    if t_opt.size > 1:
+        t_opt_str = ", ".join([str(t) for t in t_opt])
+        Logger.logger.info(
+            f"The following RMSD threshold values yielded the same optimial silhouette score: {t_opt_str}"
+        )
+        Logger.logger.info(f"The smallest RMSD of {t_opt[0]} has been adopted")
+        clusters = labels_opt[0]
+    else:
+        Logger.logger.info(f"Optimal RMSD threshold value: {t_opt}")
+        clusters = labels_opt
+
+    clust_opt.update({"optimal_cut": t_opt})
+
+    Logger.logger.info(
+        f"Saving clustering classification to {clust_opt.out_clust_name}\n"
+    )
+    np.savetxt(clust_opt.out_clust_name, clusters, fmt="%d")
+
+    return Z, clusters
 
 
 def classify_structures(
@@ -77,30 +99,12 @@ def classify_structures(
     )
     Z = hcl.linkage(distmat, clust_opt.method, optimal_ordering=clust_opt.opt_order)
 
-    if clust_opt.silhouette_score:
-        ss_opt, t_opt, labels_opt = silh_score(squareform(distmat), Z)
-
-        print("Highest silhouette score: %s" % ss_opt)
-
-        if t_opt.size > 1:
-            Logger.logger.info("The following RMSD threshold values yielded the same optimial silhouette score: %s" % t_opt)
-            Logger.logger.info("Clusters labels for each threshold: %s" % labels_opt)
-            Logger.logger.info("The smallest RMSD of {} has been adopted with the corresponding labels: {}".format(t_opt[0], labels_opt[0]))
-            clusters = labels_opt[0]
+    # build the clusters
+    clusters = hcl.fcluster(Z, clust_opt.min_rmsd, criterion="distance")
 
-        else:
-            Logger.logger.info("Optimal RMSD threshold value: %s" % t_opt)
-            clusters = labels_opt
-
-        return Z, clusters, t_opt
-
-    else:
-        # build the clusters
-        clusters = hcl.fcluster(Z, clust_opt.min_rmsd, criterion='distance')
-
-        Logger.logger.info(
-            f"Saving clustering classification to {clust_opt.out_clust_name}\n"
-        )
-        np.savetxt(clust_opt.out_clust_name, clusters, fmt="%d")
+    Logger.logger.info(
+        f"Saving clustering classification to {clust_opt.out_clust_name}\n"
+    )
+    np.savetxt(clust_opt.out_clust_name, clusters, fmt="%d")
 
-        return Z, clusters
+    return Z, clusters
diff --git a/clusttraj/io.py b/clusttraj/io.py
@@ -48,6 +48,12 @@ class ClustOptions:
     summary_name: str = None
     solute_natoms: int = None
     reorder_excl: np.ndarray = None
+    optimal_cut: np.ndarray = None
+
+    def update(self, new):
+        for key, value in new.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
 
     def __str__(self):
         """Return a string representation of the ClustOptions object.
@@ -59,7 +65,11 @@ def __str__(self):
 
         # main parameters
         return_str += f"\n\nClusterized from trajectory file: {self.trajfile}\n"
-        return_str += f"Method: {self.method}\nRMSD criterion: {self.min_rmsd}\n"
+        return_str += f"Method: {self.method}\n"
+        if self.silhouette_score:
+            return_str += "\n Using "
+        else:
+            return_str += "RMSD criterion: {self.min_rmsd}\n"
         return_str += f"Ignoring hydrogens?: {self.no_hydrogen}\n"
 
         # reordering options
@@ -274,7 +284,7 @@ def configure_runtime(args_in: List[str]) -> ClustOptions:
         "-ss",
         "--silhouette-score",
         action="store_true",
-        help="use the silhouette score to determine the optimal number of clusters"
+        help="use the silhouette score to determine the optimal number of clusters",
     )
     parser.add_argument(
         "--log",
@@ -525,7 +535,7 @@ def parse_args(args: argparse.Namespace) -> ClustOptions:
         "opt_order": args.optimal_ordering,
         "overwrite": args.force,
         "final_kabsch": args.final_kabsch,
-        "silhouette_score" = args.silhouette_score,
+        "silhouette_score": args.silhouette_score,
     }
 
     if args.reorder:

diff --git a/clusttraj/main.py b/clusttraj/main.py
@@ -25,7 +25,7 @@
 from .io import Logger, configure_runtime, save_clusters_config
 from .distmat import get_distmat
 from .plot import plot_clust_evo, plot_dendrogram, plot_mds
-from .classify import classify_structures
+from .classify import classify_structures, classify_structures_silhouette
 
 
 def main(args=None) -> None:
@@ -48,7 +48,7 @@ def main(args=None) -> None:
 
     # perform the clustering
     if clust_opt.silhouette_score:
-        Z, clusters, t_opt = classify_structures(clust_opt, distmat)
+        Z, clusters, t_opt = classify_structures_silhouette(clust_opt, distmat)
     else:
         Z, clusters = classify_structures(clust_opt, distmat)
 

diff --git a/clusttraj/plot.py b/clusttraj/plot.py
@@ -24,7 +24,7 @@ def plot_clust_evo(clust_opt: ClustOptions, clusters: np.ndarray) -> None:
     plt.savefig(clust_opt.evo_name, bbox_inches="tight")
 
 
-def plot_dendrogram(clust_opt: ClustOptions, Z: np.ndarray, t_opt = None) -> None:
+def plot_dendrogram(clust_opt: ClustOptions, Z: np.ndarray) -> None:
     """Plot a dendrogram based on hierarchical clustering.
 
     Parameters:
@@ -48,10 +48,8 @@ def plot_dendrogram(clust_opt: ClustOptions, Z: np.ndarray, t_opt = None) -> Non
 
     # Add a horizontal line at the minimum RMSD value
     if clust_opt.silhouette_score:
-        try: 
-            plt.axhline(t_opt, linestyle="--")
-        except:
-            plt.axhline(t_opt[0], linestyle="--")
+        for m_rmsd in clust_opt.optimal_cut:
+            plt.axhline(m_rmsd, linestyle="--")
     else:
         plt.axhline(clust_opt.min_rmsd, linestyle="--")