Skip to content

Commit

Permalink
Fixes CI errors.
Browse files Browse the repository at this point in the history
  • Loading branch information
hmcezar committed Sep 20, 2023
1 parent 92e8d34 commit 12caf0c
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 59 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# clusstraj - A Python package for clustering similar structures from molecular simulations
This Python script receives a molecular dynamics or Monte Carlo trajectory (in .pdb, .xyz or any format supported by OpenBabel), finds the minimum RMSD between the structures with the Kabsch algorithm and performs agglomerative clustering (a kind of unsupervised machine learning) to classify similar conformations.
# ClustTraj - A Python package for clustering similar structures from molecular simulations
This Python package receives a molecular dynamics or Monte Carlo trajectory (in .pdb, .xyz or any format supported by OpenBabel), finds the minimum RMSD between the structures with the Kabsch algorithm and performs agglomerative clustering (a kind of unsupervised machine learning) to classify similar conformations.

What the script does is to calculate the distance (using the minimum RMSD) between each configuration of the trajectory, building a distance matrix (stored in the condensed form).
Different strategies can be used in order to compute distances that correspond to the expected minimum RMSD, such as atom reordering or stepwise alignments.
Expand Down
Binary file removed clusttraj/__init__.pyc
Binary file not shown.
98 changes: 51 additions & 47 deletions clusttraj/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,29 @@
from .io import ClustOptions, Logger


def silh_score(X, Z, dstep=0.1):
def classify_structures_silhouette(
clust_opt: ClustOptions, distmat: np.ndarray, dstep: float = 0.1
) -> Tuple[np.float64, np.ndarray, np.ndarray]:
"""
Find the optimal threshold following the silhouette score metric.
Find the optimal threshold following the silhouette score metric and perform the classification.
:param X: Array with initial data (distmat matrix)
:type X: numpy.ndarray
:param Z: 'Z' matrix from the scipy.cluster.hierarchy.linkage() method
:type Z: numpy.ndarray
:param dstep: Interval between threshold values, defaults to 0.1
:type dstep: float
Args:
X (np.ndarray): Array with initial data (distmat matrix)
dstep (float, optional): Interval between threshold values, defaults to 0.1
:return: The optimal silhouette score value
:rtype: numpy.float64
Returns:
Tuple[np.float64, np.ndarray, np.ndarray]:
- The optimal silhouette score value
- An array with degenerated threshold values that yield the same optimal score
- An array with the cluster's labels from each optimal score
"""

An array with degenerated threshold values that yield the same optimal score
:rtype: numpy.ndarray
# linkage
Logger.logger.info(
f"Clustering using '{clust_opt.method}' method to join the clusters\n"
)
Z = hcl.linkage(distmat, clust_opt.method, optimal_ordering=clust_opt.opt_order)

An array with the cluster's labels from each optimal score
:rtype: numpy.ndarray
"""

# Initialize the score and threshold
ss_opt = np.float64()
t_opt = np.array([])
Expand All @@ -39,10 +39,12 @@ def silh_score(X, Z, dstep=0.1):

for t in t_range:
# Create an array with cluster labels
hcl_labels = hcl.fcluster(Z, t=t, criterion='distance')
hcl_labels = hcl.fcluster(Z, t=t, criterion="distance")

# Compute the silhouette score
ss = metrics.silhouette_score(X, hcl_labels, metric='precomputed')
ss = metrics.silhouette_score(
squareform(distmat), hcl_labels, metric="precomputed"
)

# Check for degeneracy for the optimal threshold value
if np.any(ss == ss_opt):
Expand All @@ -56,7 +58,27 @@ def silh_score(X, Z, dstep=0.1):
t_opt = t
labels_opt = hcl_labels

return np.round(ss_opt, 3), np.round(t_opt, 3), labels_opt
Logger.logger.info(f"Highest silhouette score: {ss_opt}")

if t_opt.size > 1:
t_opt_str = ", ".join([str(t) for t in t_opt])
Logger.logger.info(
f"The following RMSD threshold values yielded the same optimial silhouette score: {t_opt_str}"
)
Logger.logger.info(f"The smallest RMSD of {t_opt[0]} has been adopted")
clusters = labels_opt[0]
else:
Logger.logger.info(f"Optimal RMSD threshold value: {t_opt}")
clusters = labels_opt

clust_opt.update({"optimal_cut": t_opt})

Logger.logger.info(
f"Saving clustering classification to {clust_opt.out_clust_name}\n"
)
np.savetxt(clust_opt.out_clust_name, clusters, fmt="%d")

return Z, clusters


def classify_structures(
Expand All @@ -77,30 +99,12 @@ def classify_structures(
)
Z = hcl.linkage(distmat, clust_opt.method, optimal_ordering=clust_opt.opt_order)

if clust_opt.silhouette_score:
ss_opt, t_opt, labels_opt = silh_score(squareform(distmat), Z)

print("Highest silhouette score: %s" % ss_opt)

if t_opt.size > 1:
Logger.logger.info("The following RMSD threshold values yielded the same optimial silhouette score: %s" % t_opt)
Logger.logger.info("Clusters labels for each threshold: %s" % labels_opt)
Logger.logger.info("The smallest RMSD of {} has been adopted with the corresponding labels: {}".format(t_opt[0], labels_opt[0]))
clusters = labels_opt[0]
# build the clusters
clusters = hcl.fcluster(Z, clust_opt.min_rmsd, criterion="distance")

else:
Logger.logger.info("Optimal RMSD threshold value: %s" % t_opt)
clusters = labels_opt

return Z, clusters, t_opt

else:
# build the clusters
clusters = hcl.fcluster(Z, clust_opt.min_rmsd, criterion='distance')

Logger.logger.info(
f"Saving clustering classification to {clust_opt.out_clust_name}\n"
)
np.savetxt(clust_opt.out_clust_name, clusters, fmt="%d")
Logger.logger.info(
f"Saving clustering classification to {clust_opt.out_clust_name}\n"
)
np.savetxt(clust_opt.out_clust_name, clusters, fmt="%d")

return Z, clusters
return Z, clusters
16 changes: 13 additions & 3 deletions clusttraj/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ class ClustOptions:
summary_name: str = None
solute_natoms: int = None
reorder_excl: np.ndarray = None
optimal_cut: np.ndarray = None

def update(self, new):
for key, value in new.items():
if hasattr(self, key):
setattr(self, key, value)

def __str__(self):
"""Return a string representation of the ClustOptions object.
Expand All @@ -59,7 +65,11 @@ def __str__(self):

# main parameters
return_str += f"\n\nClusterized from trajectory file: {self.trajfile}\n"
return_str += f"Method: {self.method}\nRMSD criterion: {self.min_rmsd}\n"
return_str += f"Method: {self.method}\n"
if self.silhouette_score:
return_str += "\n Using "
else:
return_str += "RMSD criterion: {self.min_rmsd}\n"
return_str += f"Ignoring hydrogens?: {self.no_hydrogen}\n"

# reordering options
Expand Down Expand Up @@ -274,7 +284,7 @@ def configure_runtime(args_in: List[str]) -> ClustOptions:
"-ss",
"--silhouette-score",
action="store_true",
help="use the silhouette score to determine the optimal number of clusters"
help="use the silhouette score to determine the optimal number of clusters",
)
parser.add_argument(
"--log",
Expand Down Expand Up @@ -525,7 +535,7 @@ def parse_args(args: argparse.Namespace) -> ClustOptions:
"opt_order": args.optimal_ordering,
"overwrite": args.force,
"final_kabsch": args.final_kabsch,
"silhouette_score" = args.silhouette_score,
"silhouette_score": args.silhouette_score,
}

if args.reorder:
Expand Down
4 changes: 2 additions & 2 deletions clusttraj/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from .io import Logger, configure_runtime, save_clusters_config
from .distmat import get_distmat
from .plot import plot_clust_evo, plot_dendrogram, plot_mds
from .classify import classify_structures
from .classify import classify_structures, classify_structures_silhouette


def main(args=None) -> None:
Expand All @@ -48,7 +48,7 @@ def main(args=None) -> None:

# perform the clustering
if clust_opt.silhouette_score:
Z, clusters, t_opt = classify_structures(clust_opt, distmat)
Z, clusters, t_opt = classify_structures_silhouette(clust_opt, distmat)
else:
Z, clusters = classify_structures(clust_opt, distmat)

Expand Down
8 changes: 3 additions & 5 deletions clusttraj/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def plot_clust_evo(clust_opt: ClustOptions, clusters: np.ndarray) -> None:
plt.savefig(clust_opt.evo_name, bbox_inches="tight")


def plot_dendrogram(clust_opt: ClustOptions, Z: np.ndarray, t_opt = None) -> None:
def plot_dendrogram(clust_opt: ClustOptions, Z: np.ndarray) -> None:
"""Plot a dendrogram based on hierarchical clustering.
Parameters:
Expand All @@ -48,10 +48,8 @@ def plot_dendrogram(clust_opt: ClustOptions, Z: np.ndarray, t_opt = None) -> Non

# Add a horizontal line at the minimum RMSD value
if clust_opt.silhouette_score:
try:
plt.axhline(t_opt, linestyle="--")
except:
plt.axhline(t_opt[0], linestyle="--")
for m_rmsd in clust_opt.optimal_cut:
plt.axhline(m_rmsd, linestyle="--")
else:
plt.axhline(clust_opt.min_rmsd, linestyle="--")

Expand Down

0 comments on commit 12caf0c

Please sign in to comment.