Merge pull request #238 from YosefLab/lca-reconstruct
Lca reconstruct
colganwi authored Mar 8, 2024
2 parents bb2fd4f + dced953 commit 41dbff8
Showing 2 changed files with 54 additions and 13 deletions.
14 changes: 10 additions & 4 deletions cassiopeia/data/utilities.py
@@ -1,6 +1,7 @@
"""
General utilities for the datasets encountered in Cassiopeia.
"""

import collections
from joblib import delayed
import multiprocessing
@@ -61,7 +62,7 @@ def get_lca_characters(
all_states = [
vec[i] for vec in vecs if vec[i] != missing_state_indicator
]

# this check is specifically if all_states consists of a single
# ambiguous state.
if len(list(set(all_states))) == 1:
@@ -72,6 +73,9 @@
else:
lca_vec[i] = all_states[0]
else:
+ all_ambiguous = np.all(
+ [is_ambiguous_state(s) for s in all_states]
+ )
chars = set.intersection(
*map(
set,
@@ -83,6 +87,10 @@
)
if len(chars) == 1:
lca_vec[i] = list(chars)[0]
+ if all_ambiguous:
+ # if we only have ambiguous states, we set the LCA state
+ # to be the intersection.
+ lca_vec[i] = tuple(chars)
return lca_vec


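Note: the change above extends get_lca_characters so that, when every observed state at a character is itself ambiguous, the LCA keeps the whole intersection as an ambiguous (tuple) state; previously the intersection was only used when it reduced to exactly one state. A small usage sketch follows, not part of this commit, with inputs and the expected output modeled on the new test_lca_characters_ambiguous2 added below (assumes cassiopeia.data.utilities imports as shown):

# Usage sketch (illustrative only, not committed code).
from cassiopeia.data import utilities as data_utilities

lca = data_utilities.get_lca_characters(
    [
        [(0, 2, 3), (4, 62)],  # sample 1: both characters ambiguous
        [(0, 2), 4],           # sample 2: ambiguous, then unambiguous
    ],
    missing_state_indicator=-1,
)
# character 0: every observed state is ambiguous, so the intersection
#   is kept as an ambiguous state -> (0, 2)
# character 1: only one state is ambiguous and the intersection is a
#   single state, so it collapses -> 4
print(lca)  # [(0, 2), 4]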
@@ -228,9 +236,7 @@ def compute_dissimilarity_map(
]

# load character matrix into shared memory
- shm = shared_memory.SharedMemory(
- create=True, size=cm.nbytes
- )
+ shm = shared_memory.SharedMemory(create=True, size=cm.nbytes)
shared_cm = np.ndarray(cm.shape, dtype=cm.dtype, buffer=shm.buf)
shared_cm[:] = cm[:]

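Note: the compute_dissimilarity_map hunk is a line-wrapping change only; the call it touches is the step that copies the character matrix into a shared-memory block so worker processes can read it without each receiving its own copy (the parallel code itself is outside the hunk shown). A minimal standalone sketch of that pattern, using only the standard library and NumPy; the example array and names other than SharedMemory/np.ndarray are illustrative:

import numpy as np
from multiprocessing import shared_memory

cm = np.zeros((10, 8), dtype=np.int64)  # stand-in for a character matrix

# allocate a shared block and copy the matrix into it
shm = shared_memory.SharedMemory(create=True, size=cm.nbytes)
shared_cm = np.ndarray(cm.shape, dtype=cm.dtype, buffer=shm.buf)
shared_cm[:] = cm[:]

# a worker process would reattach by name rather than copy the data:
# existing = shared_memory.SharedMemory(name=shm.name)
# view = np.ndarray(cm.shape, dtype=cm.dtype, buffer=existing.buf)

# release the block once all workers are finished
shm.close()
shm.unlink()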
53 changes: 44 additions & 9 deletions test/data_tests/data_utilities_test.py
@@ -85,7 +85,7 @@ def test_bootstrap_character_matrices_no_priors(self):

self.assertEqual(len(bootstrap_samples), 10)

- for (bootstrap_matrix, bootstrap_priors) in bootstrap_samples:
+ for bootstrap_matrix, bootstrap_priors in bootstrap_samples:
self.assertCountEqual(
self.character_matrix.index, bootstrap_matrix.index
)
@@ -113,7 +113,7 @@ def test_bootstrap_character_matrices_with_priors(self):

self.assertEqual(len(bootstrap_samples), 10)

- for (bootstrap_matrix, bootstrap_priors) in bootstrap_samples:
+ for bootstrap_matrix, bootstrap_priors in bootstrap_samples:
self.assertCountEqual(
self.character_matrix.index, bootstrap_matrix.index
)
@@ -316,6 +316,37 @@ def test_lca_characters_ambiguous(self):
)
self.assertEqual(ret_vec, [1, 2, 3, 0, 5])

+ def test_lca_characters_ambiguous2(self):
+
+ s1 = [
+ (4, 62),
+ (3, 10),
+ (3, 10, 16),
+ (0, 3),
+ (0, 2, 3),
+ (0, 2, 3),
+ (0, 4, 7),
+ (0, 2, 23),
+ (0, 1, 4, 44),
+ ]
+ s2 = [4, 3, -1, 0, 0, 0, (0, 7), (0, 2), (0, 4)]
+
+ expected_reconstruction = [
+ 4,
+ 3,
+ (3, 10, 16),
+ 0,
+ 0,
+ 0,
+ (0, 7),
+ (0, 2),
+ (0, 4),
+ ]
+ ret_vec = data_utilities.get_lca_characters(
+ [s1, s2], missing_state_indicator=-1
+ )
+ self.assertEqual(ret_vec, expected_reconstruction)
+
def test_lca_characters_ambiguous_and_missing(self):
vecs = [
[(1, 1), (0, 2), (3, 0), (4,), (5,)],
@@ -325,7 +356,7 @@ def test_lca_characters_ambiguous_and_missing(self):
ret_vec = data_utilities.get_lca_characters(
vecs, missing_state_indicator=-1
)
- self.assertEqual(ret_vec, [1, (0,2), (3,0), 0, 5])
+ self.assertEqual(ret_vec, [1, (0, 2), (3, 0), 0, 5])

def test_resolve_most_abundant(self):
state = (1, 2, 3, 3)
@@ -452,8 +483,10 @@ def test_inter_cluster_distance_basic(self):

tree = CassiopeiaTree(tree=tree, cell_meta=meta_data)

- inter_cluster_distances = data_utilities.compute_inter_cluster_distances(
- tree, meta_item="CellType"
+ inter_cluster_distances = (
+ data_utilities.compute_inter_cluster_distances(
+ tree, meta_item="CellType"
+ )
)

expected_distances = pd.DataFrame.from_dict(
@@ -507,10 +540,12 @@ def test_inter_cluster_distance_custom_input(self):

tree = CassiopeiaTree(tree=tree)

- inter_cluster_distances = data_utilities.compute_inter_cluster_distances(
- tree,
- meta_data=meta_data["CellType"],
- dissimilarity_map=weight_matrix,
+ inter_cluster_distances = (
+ data_utilities.compute_inter_cluster_distances(
+ tree,
+ meta_data=meta_data["CellType"],
+ dissimilarity_map=weight_matrix,
+ )
)

expected_distances = pd.DataFrame.from_dict(
