diff --git a/cassiopeia/critique/critique_utilities.py b/cassiopeia/critique/critique_utilities.py
index e5e5f38d..67bc67fd 100644
--- a/cassiopeia/critique/critique_utilities.py
+++ b/cassiopeia/critique/critique_utilities.py
@@ -35,7 +35,7 @@ def annotate_tree_depths(tree: CassiopeiaTree) -> None:
     in place.
 
     Args:
-        tree: An ete3 Tree
+        tree: A CassiopeiaTree
 
     Returns:
         A dictionary mapping depth to the list of nodes at that depth.
@@ -70,7 +70,7 @@ def get_outgroup(tree: CassiopeiaTree, triplet: Tuple[str, str, str]) -> str:
     of the LCA from the number of shared ancestors.
 
     Args:
-        tree: CassiopeiaTree
+        tree: A CassiopeiaTree
         triplet: A tuple of three leaves constituting a triplet.
 
     Returns:
@@ -97,21 +97,20 @@ def get_outgroup(tree: CassiopeiaTree, triplet: Tuple[str, str, str]) -> str:
 def sample_triplet_at_depth(
     tree: CassiopeiaTree,
     depth: int,
     depth_to_nodes: Optional[Dict[int, List[str]]] = None,
-) -> Tuple[List[int], str]:
+) -> Tuple[Tuple[str], str]:
     """Samples a triplet at a given depth.
 
     Samples a triplet of leaves such that the depth of the LCA of the triplet
     is at the specified depth.
 
     Args:
-        tree: CassiopeiaTree
+        tree: A CassiopeiaTree
         depth: Depth at which to sample the triplet
         depth_to_nodes: An optional dictionary that maps a depth to the nodes
             that appear at that depth. This speeds up the function considerably.
 
     Returns:
-        A list of three leaves corresponding to the triplet name of the outgroup
-        of the triplet. 
+        The sampled triplet as well as the outgroup of the triplet
     """
     if depth_to_nodes is None:
@@ -121,12 +120,12 @@ def sample_triplet_at_depth(
     total_triplets = sum([tree.get_attribute(v, "number_of_triplets")
                           for v in candidate_nodes])
 
-    # sample a node from this depth with probability proportional to the number 
+    # sample a node from this depth with probability proportional to the number
     # of triplets underneath it
     probs = [tree.get_attribute(v, "number_of_triplets") / total_triplets for v in candidate_nodes]
     node = np.random.choice(candidate_nodes, size=1, replace=False, p=probs)[0]
 
-    # Generate the probilities to sample each combination of 3 daughter clades
+    # Generate the probabilities to sample each combination of 3 daughter clades
     # to sample from, proportional to the number of triplets in each daughter
     # clade. Choices include all ways to choose 3 different daughter clades
     # or 2 from one daughter clade and one from another
diff --git a/cassiopeia/data/utilities.py b/cassiopeia/data/utilities.py
index 14ec0fba..1cc54dc9 100644
--- a/cassiopeia/data/utilities.py
+++ b/cassiopeia/data/utilities.py
@@ -85,7 +85,7 @@ def ete3_to_networkx(tree: ete3.Tree) -> nx.DiGraph:
             n.name = f"node{internal_node_iter}"
             internal_node_iter += 1
 
-        g.add_edge(n.up.name, n.name)
+        g.add_edge(n.up.name.replace("'", ""), n.name.replace("'", ""))
 
     return g
 
diff --git a/cassiopeia/solver/STDRSolver.py b/cassiopeia/solver/STDRSolver.py
new file mode 100644
index 00000000..2de28f68
--- /dev/null
+++ b/cassiopeia/solver/STDRSolver.py
@@ -0,0 +1,43 @@
+from typing import Callable
+import dendropy
+import numpy as np
+import ete3
+import spectraltree
+from typing import Callable
+
+from cassiopeia.data import CassiopeiaTree
+from cassiopeia.data import utilities as data_utilities
+from cassiopeia.solver import CassiopeiaSolver
+
+class STDRSolver(CassiopeiaSolver.CassiopeiaSolver):
+
+    def __init__(
+        self,
+        similarity_function: Callable[[np.array], np.array],
+        threshold: int=64,
+        min_split: int=1,
+        merge_method: str="least_square",
+        verbose: bool = False
+    ):
+
+        self.similarity_function = similarity_function
+        self.threshold = threshold
+        self.min_split = min_split
+        self.merge_method = merge_method
+        self.verbose = verbose
+
+    def solve(self, cassiopeia_tree: CassiopeiaTree) -> None:
+        character_matrix = cassiopeia_tree.get_current_character_matrix()
+        taxon_namespace = dendropy.TaxonNamespace(list(character_matrix.index))
+        metadata = spectraltree.utils.TaxaMetadata(taxon_namespace, list(character_matrix.index), alphabet=None)
+
+        stdr_nj = spectraltree.STDR(spectraltree.NeighborJoining, self.similarity_function)
+
+        tree_stdr_nj = stdr_nj(character_matrix.values,
+                               taxa_metadata= metadata,
+                               threshold = self.threshold,
+                               min_split = self.min_split,
+                               merge_method = self.merge_method,
+                               verbose=self.verbose)
+        newick_str = tree_stdr_nj.as_string(schema="newick")
+        cassiopeia_tree.populate_tree(newick_str)
\ No newline at end of file
diff --git a/cassiopeia/solver/__init__.py b/cassiopeia/solver/__init__.py
index 46850ea8..b634b065 100755
--- a/cassiopeia/solver/__init__.py
+++ b/cassiopeia/solver/__init__.py
@@ -10,6 +10,7 @@
 from .MaxCutSolver import MaxCutSolver
 from .NeighborJoiningSolver import NeighborJoiningSolver
 from .PercolationSolver import PercolationSolver
+from .STDRSolver import STDRSolver
 from .SharedMutationJoiningSolver import SharedMutationJoiningSolver
 from .SpectralGreedySolver import SpectralGreedySolver
 from .SpectralSolver import SpectralSolver
diff --git a/cassiopeia/solver/stdr_similarities.py b/cassiopeia/solver/stdr_similarities.py
new file mode 100644
index 00000000..11b5b242
--- /dev/null
+++ b/cassiopeia/solver/stdr_similarities.py
@@ -0,0 +1,108 @@
+"""This file stores a variety of ways to calculate the similarity used for STDR with neighborjoining
+"""
+
+import numpy as np
+import scipy
+import spectraltree
+
+def JC_hamming_sim(vals):
+    return spectraltree.JC_similarity_matrix(vals)
+
+def hamming_sim(vals):
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    return 1 - hamming_matrix
+
+def hamming_sim_ignore_missing(vals):
+    missing_val = -1
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    missing_array = (vals==missing_val)
+    pdist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_and(u,v))))
+
+    return vals.shape[1] - hamming_matrix*vals.shape[1] - pdist
+
+def hamming_sim_ignore_missing_normalize_over_nonmissing(vals):
+    missing_val = -1
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    missing_array = (vals==missing_val)
+    pdist_xor = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_xor(u,v))))
+    pdist_or = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_or(u,v))))
+
+    ret_mat = 1 - (hamming_matrix*vals.shape[1] - pdist_xor) / (np.ones_like(hamming_matrix) * vals.shape[1] - pdist_or)
+    ret_mat[np.isnan(ret_mat)] = 0
+    ret_mat[np.isinf(ret_mat)] = 0
+    return ret_mat
+
+def hamming_sim_ignore_uncut(vals):
+    uncut_state = 0
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    uncut_array = (vals==uncut_state)
+    pdist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(uncut_array, lambda u,v: np.sum(np.logical_and(u,v))))
+    return vals.shape[1] - hamming_matrix*vals.shape[1] - pdist
+
+def hamming_sim_ignore_uncut_normalize_over_cut(vals):
+    uncut_state = 0
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    uncut_array = (vals==uncut_state)
+    pdist_xor = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(uncut_array, lambda u,v: np.sum(np.logical_xor(u,v))))
+    pdist_or = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(uncut_array, lambda u,v: np.sum(np.logical_or(u,v))))
+
+    ret_mat = 1 - (hamming_matrix*vals.shape[1] - pdist_xor) / (np.ones_like(hamming_matrix) * vals.shape[1] - pdist_or)
+    ret_mat[np.isnan(ret_mat)] = 0
+    ret_mat[np.isinf(ret_mat)] = 0
+    return ret_mat
+
+def hamming_sim_ignore_both(vals):
+    missing_val = -1
+    uncut_state = 0
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    missing_array = (vals==missing_val)
+    uncut_array = (vals==uncut_state)
+    missing_pdist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_and(u,v))))
+    uncut_pdist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(uncut_array, lambda u,v: np.sum(np.logical_and(u,v))))
+    return (1 - hamming_matrix)*vals.shape[1] - missing_pdist - uncut_pdist
+
+def hamming_sim_ignore_both_normalize_over_nonmissing(vals):
+    missing_val = -1
+    uncut_state = 0
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    missing_array = (vals==missing_val)
+    uncut_array = (vals==uncut_state)
+    missing_and = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_and(u,v))))
+    missing_or = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_or(u,v))))
+    uncut_and = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(uncut_array, lambda u,v: np.sum(np.logical_and(u,v))))
+
+    ret_mat = ((1 - hamming_matrix)*vals.shape[1] - missing_and - uncut_and) / (np.ones_like(hamming_matrix) * vals.shape[1] - missing_or)
+    ret_mat[np.isnan(ret_mat)] = 0
+    ret_mat[np.isinf(ret_mat)] = 0
+    return ret_mat
+
+def hamming_sim_ignore_both_normalize_over_cut(vals):
+    missing_val = -1
+    uncut_state = 0
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    missing_array = (vals==missing_val)
+    uncut_array = (vals==uncut_state)
+    missing_and = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_and(u,v))))
+    uncut_or = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(uncut_array, lambda u,v: np.sum(np.logical_or(u,v))))
+    uncut_and = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(uncut_array, lambda u,v: np.sum(np.logical_and(u,v))))
+
+    ret_mat = ((1 - hamming_matrix)*vals.shape[1] - missing_and - uncut_and) / (np.ones_like(hamming_matrix) * vals.shape[1] - uncut_or)
+    ret_mat[np.isnan(ret_mat)] = 0
+    ret_mat[np.isinf(ret_mat)] = 0
+    return ret_mat
+
+def hamming_sim_ignore_both_normalize_over_both(vals):
+    missing_val = -1
+    uncut_state = 0
+    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
+    missing_array = (vals==missing_val)
+    uncut_array = (vals==uncut_state)
+    either_array = missing_array + uncut_array
+    missing_and = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_and(u,v))))
+    uncut_and = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(uncut_array, lambda u,v: np.sum(np.logical_and(u,v))))
+    either_or = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(either_array, lambda u,v: np.sum(np.logical_or(u,v))))
+
+    ret_mat = np.divide((1 - hamming_matrix)*vals.shape[1] - missing_and - uncut_and, np.ones_like(hamming_matrix) * vals.shape[1] - either_or)
+    ret_mat[np.isnan(ret_mat)] = 0
+    ret_mat[np.isinf(ret_mat)] = 0
+    return ret_mat
\ No newline at end of file