aristoteleo · Ukyeon · Feb 25, 2023 · Feb 25, 2023 · Feb 27, 2023 · May 4, 2023
diff --git a/dynamo/configuration.py b/dynamo/configuration.py
@@ -36,6 +36,7 @@ class DynamoAdataKeyManager:
     # special key names frequently used in dynamo
     X_LAYER = "X"
     PROTEIN_LAYER = "protein"
+    X_PCA = "X_pca"
 
     def gen_new_layer_key(layer_name, key, sep="_") -> str:
         """utility function for returning a new key name for a specific layer. By convention layer_name should not have the separator as the last character."""
@@ -106,7 +107,7 @@ def check_if_layer_exist(adata: AnnData, layer: str) -> bool:
     def get_available_layer_keys(adata, layers="all", remove_pp_layers=True, include_protein=True):
         """Get the list of available layers' keys. If `layers` is set to all, return a list of all available layers; if `layers` is set to a list, then the intersetion of available layers and `layers` will be returned."""
         layer_keys = list(adata.layers.keys())
-        if layers is None: # layers=adata.uns["pp"]["experiment_layers"], in calc_sz_factor
+        if layers is None:  # layers=adata.uns["pp"]["experiment_layers"], in calc_sz_factor
             layers = "X"
         if remove_pp_layers:
             layer_keys = [i for i in layer_keys if not i.startswith("X_")]
@@ -143,10 +144,7 @@ def allowed_X_layer_names():
     def init_uns_pp_namespace(adata: AnnData):
         adata.uns[DynamoAdataKeyManager.UNS_PP_KEY] = {}
 
-    def get_excluded_layers(
-        X_total_layers: bool = False,
-        splicing_total_layers: bool = False
-    ) -> List:
+    def get_excluded_layers(X_total_layers: bool = False, splicing_total_layers: bool = False) -> List:
         """Get a list of excluded layers based on the provided arguments.
 
         When splicing_total_layers is False, the function normalize spliced and unspliced RNA separately using each
@@ -199,6 +197,7 @@ def aggregate_layers_into_total(
                 layers.extend(["_total_"])
         return total_layers, layers
 
+
 # TODO discuss alias naming convention
 DKM = DynamoAdataKeyManager
 

diff --git a/dynamo/plot/networks.py b/dynamo/plot/networks.py
@@ -8,9 +8,9 @@
 import networkx as nx
 import numpy as np
 import pandas as pd
+
 from anndata import AnnData
 from matplotlib.axes import Axes
-
 from ..tools.utils import flatten, index_gene, update_dict
 from .utils import save_fig, set_colorbar
 from .utils_graph import ArcPlot

diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py
@@ -118,7 +118,6 @@ def basic_stats(
             s_kwargs["close"] = False
         save_fig(**s_kwargs)
     if save_show_or_return in ["show", "both", "all"]:
-        import matplotlib.pyplot as plt
         plt.tight_layout()
         plt.show()
     if save_show_or_return in ["return", "all"]:
@@ -435,7 +434,6 @@ def biplot(
     figsize: Tuple[float, float] = (6, 4),
     scale_pca_embedding: bool = False,
     draw_pca_embedding: bool = False,
-
     save_show_or_return: Literal["save", "show", "return"] = "show",
     save_kwargs: Dict[str, Any] = {},
     ax: Optional[Axes] = None,

diff --git a/dynamo/plot/scatters.py b/dynamo/plot/scatters.py
@@ -795,7 +795,8 @@ def _plot_basis_layer(cur_b, cur_l):
                     if deaxis:
                         deaxis_all(ax)
 
-                ax.set_title(cur_title)
+                ax.set_title(cur_title, color=font_color)
+                ax.tick_params(axis="both", colors=font_color)
 
                 axes_list.append(ax)
                 color_list.append(color_out)

diff --git a/dynamo/prediction/fate.py b/dynamo/prediction/fate.py
@@ -9,12 +9,14 @@
 from sklearn.neighbors import NearestNeighbors
 from tqdm import tqdm
 
+from ..configuration import DKM
 from ..dynamo_logger import (
     LoggerManager,
     main_info,
     main_info_insert_adata,
     main_warning,
 )
+from ..tools.connectivity import get_mapper_umap
 from ..tools.utils import fetch_states, getTseq
 from ..vectorfield import vector_field_function
 from ..vectorfield.utils import vecfld_from_adata, vector_transformation
@@ -163,17 +165,33 @@ def fate(
         # this requires umap 0.4; reverse project to PCA space.
         if prediction.ndim == 1:
             prediction = prediction[None, :]
-        exprs = adata.uns["umap_fit"]["fit"].inverse_transform(prediction)
+
+        params = adata.uns["umap"]
+        mapper = get_mapper_umap(
+            params["X_data"],
+            n_components=params["umap_kwargs"]["n_components"],
+            metric=params["umap_kwargs"]["metric"],
+            min_dist=params["umap_kwargs"]["min_dist"],
+            spread=params["umap_kwargs"]["spread"],
+            max_iter=params["umap_kwargs"]["max_iter"],
+            alpha=params["umap_kwargs"]["alpha"],
+            gamma=params["umap_kwargs"]["gamma"],
+            negative_sample_rate=params["umap_kwargs"]["negative_sample_rate"],
+            init_pos=params["umap_kwargs"]["init_pos"],
+            random_state=params["umap_kwargs"]["random_state"],
+            umap_kwargs=params["umap_kwargs"],
+        )
+        exprs = mapper.inverse_transform(prediction)
 
         # further reverse project back to raw expression space
         PCs = adata.uns["PCs"].T
         if PCs.shape[0] == exprs.shape[1]:
             exprs = np.expm1(exprs @ PCs + adata.uns["pca_mean"])
 
-        ndim = adata.uns["umap_fit"]["fit"]._raw_data.shape[1]
+        ndim = mapper._raw_data.shape[1]
 
         if "X" in adata.obsm_keys():
-            if ndim == adata.obsm["X"].shape[1]:  # lift the dimension up again
+            if ndim == adata.obsm[DKM.X_PCA].shape[1]:  # lift the dimension up again
                 exprs = adata.uns["pca_fit"].inverse_transform(prediction)
 
         if adata.var.use_for_dynamics.sum() == exprs.shape[1]:
@@ -211,7 +229,7 @@ def _fate(
     interpolation_num: int = 250,
     average: bool = True,
     sampling: str = "arc_length",
-    cores:int = 1,
+    cores: int = 1,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Predict the historical and future cell transcriptomic states over arbitrary time scales by integrating vector
     field functions from one or a set of initial cell state(s).

diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py
@@ -16,14 +16,14 @@
     main_info_insert_adata,
     main_warning,
 )
+from ..tools.connectivity import neighbors as default_neighbors
+from ..tools.utils import update_dict
+from .cell_cycle import cell_cycle_scores
 from .external import (
     normalize_layers_pearson_residuals,
     sctransform,
     select_genes_by_pearson_residuals,
 )
-from ..tools.connectivity import neighbors as default_neighbors
-from ..tools.utils import update_dict
-from .cell_cycle import cell_cycle_scores
 from .gene_selection import select_genes_by_seurat_recipe, select_genes_monocle
 from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca
 from .preprocessor_utils import (
@@ -115,7 +115,7 @@ def __init__(
         self.basic_stats = basic_stats
         self.convert_layers2csr = convert_layers2csr
         self.unique_var_obs_adata = unique_var_obs_adata
-        self.norm_method = log1p
+        self.norm_method = norm_method
         self.norm_method_kwargs = norm_method_kwargs
         self.sctransform = sctransform
 
@@ -130,7 +130,6 @@ def __init__(
         self.pca_kwargs = pca_kwargs
         self.cell_cycle_score = cell_cycle_scores
 
-        # self.n_top_genes = n_top_genes
         self.convert_gene_name = convert_gene_name_function
         self.collapse_species_adata = collapse_species_adata_function
         self.gene_append_list = gene_append_list
@@ -443,7 +442,7 @@ def preprocess_adata_seurat_wo_pca(
 
         temp_logger.finish_progress(progress_name="preprocess by seurat wo pca recipe")
 
-    def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None:
+    def config_monocle_recipe(self, adata: AnnData) -> None:
         """Automatically configure the preprocessor for monocle recipe.
 
         Args:
@@ -483,15 +482,15 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None
             "shared_count": 30,
         }
         self.select_genes = select_genes_monocle
-        self.select_genes_kwargs = {"n_top_genes": n_top_genes, "SVRs_kwargs": {"relative_expr": False}}
+        self.select_genes_kwargs.update({"SVRs_kwargs": {"relative_expr": False}})
         self.normalize_selected_genes = None
         self.normalize_by_cells = normalize
         self.norm_method = log1p
 
-        self.regress_out_kwargs = update_dict({"obs_keys": []}, self.regress_out_kwargs)
+        self.regress_out_kwargs.update({"obs_keys": []})
 
         self.pca = pca
-        self.pca_kwargs = {"pca_key": "X_pca"}
+        self.pca_kwargs.update({"pca_key": "X_pca"})
 
         self.cell_cycle_score_kwargs = {
             "layer": None,

diff --git a/dynamo/preprocessing/cell_cycle.py b/dynamo/preprocessing/cell_cycle.py
@@ -418,7 +418,7 @@ def get_cell_phase(
     else:
         cell_phase_genes = gene_list
 
-    adata.uns["cell_phase_genes"] = cell_phase_genes
+    adata.uns["cell_phase_genes"] = dict(cell_phase_genes)  # OrderedDict is not hdf5 serializable, so convert to dict.
     # score each cell cycle phase and Z-normalize
     phase_scores = pd.DataFrame(batch_group_score(adata, layer, cell_phase_genes))
     normalized_phase_scores = phase_scores.sub(phase_scores.mean(axis=1), axis=0).div(phase_scores.std(axis=1), axis=0)

diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py
@@ -1147,8 +1147,6 @@ def recipe_monocle(
 
     if method == "pca":
         adata = pca(adata, pca_input, num_dim, "X_" + method.lower())
-        # TODO remove adata.obsm["X"] in future, use adata.obsm.X_pca instead
-        adata.obsm["X"] = adata.obsm["X_" + method.lower()]
 
     elif method == "ica":
         fit = FastICA(
@@ -1159,9 +1157,7 @@ def recipe_monocle(
             max_iter=1000,
         )
         reduce_dim = fit.fit_transform(pca_input.toarray())
-
         adata.obsm["X_" + method.lower()] = reduce_dim
-        adata.obsm["X"] = adata.obsm["X_" + method.lower()]
 
     logger.info_insert_adata(method + "_fit", "uns")
     adata.uns[method + "_fit"], adata.uns["feature_selection"] = (

diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py
@@ -137,8 +137,10 @@ def convert2symbol(adata: AnnData, scopes: Union[str, Iterable, None] = None, su
         merge_df = adata.var.merge(official_gene_df, left_on="query", right_on="query", how="left").set_index(
             adata.var.index
         )
-        adata.var = merge_df
+
         valid_ind = np.where(merge_df["notfound"] != True)[0]  # noqa: E712
+        merge_df.pop("notfound")
+        adata.var = merge_df
 
         if subset is True:
             adata._inplace_subset_var(valid_ind)
@@ -535,13 +537,13 @@ def get_gene_selection_filter(
 ) -> np.ndarray:
     """Generate the mask by sorting given table of scores.
 
-        Args:
-            valid_table: the scores used to sort the highly variable genes.
-            n_top_genes: number of top genes to be filtered. Defaults to 2000.
-            basic_filter: the filter to remove outliers. For example, the `adata.var["pass_basic_filter"]`.
+    Args:
+        valid_table: the scores used to sort the highly variable genes.
+        n_top_genes: number of top genes to be filtered. Defaults to 2000.
+        basic_filter: the filter to remove outliers. For example, the `adata.var["pass_basic_filter"]`.
 
-        Returns:
-            The filter mask as a bool array.
+    Returns:
+        The filter mask as a bool array.
     """
     if basic_filter is None:
         basic_filter = pd.Series(True, index=valid_table.index)

diff --git a/dynamo/tools/cell_velocities.py b/dynamo/tools/cell_velocities.py
@@ -13,7 +13,9 @@
 from sklearn.decomposition import PCA
 from sklearn.utils import sparsefuncs
 
+from ..configuration import DKM
 from ..dynamo_logger import LoggerManager, main_info, main_warning
+from ..tools.connectivity import get_mapper_umap
 from ..utils import areinstance
 from .connectivity import _gen_neighbor_keys, adj_to_knn, check_and_recompute_neighbors
 from .dimension_reduction import reduceDimension
@@ -319,10 +321,10 @@ def cell_velocities(
     V = V.A if sp.issparse(V) else V
     X = X.A if sp.issparse(X) else X
     finite_inds = get_finite_inds(V)
-    X, V = X[:, finite_inds], V[:, finite_inds]
 
-    if sum(finite_inds) != X.shape[0]:
+    if sum(finite_inds) != X.shape[1]:
         main_info(f"{X.shape[1] - sum(finite_inds)} genes are removed because of nan velocity values.")
+        X, V = X[:, finite_inds], V[:, finite_inds]
         if transition_genes is not None:  # if X, V is provided by the user, transition_genes will be None
             adata.var.loc[np.array(transition_genes)[~finite_inds], "use_for_transition"] = False
 
@@ -518,28 +520,47 @@ def cell_velocities(
             adata.obsp["discrete_vector_field"] = E
 
     elif method == "transform":
-        umap_trans, n_pca_components = (
-            adata.uns["umap_fit"]["fit"],
-            adata.uns["umap_fit"]["n_pca_components"],
+        if "X_data" not in adata.uns["umap"].keys():
+            raise Exception(
+                f"Parameter info of UMAP is missing in the provided anndata object. "
+                "Run `dyn.tl.reduceDimension` before doing this."
+            )
+
+        params = adata.uns["umap"]
+        mapper = get_mapper_umap(
+            params["X_data"],
+            n_components=params["umap_kwargs"]["n_components"],
+            metric=params["umap_kwargs"]["metric"],
+            min_dist=params["umap_kwargs"]["min_dist"],
+            spread=params["umap_kwargs"]["spread"],
+            max_iter=params["umap_kwargs"]["max_iter"],
+            alpha=params["umap_kwargs"]["alpha"],
+            gamma=params["umap_kwargs"]["gamma"],
+            negative_sample_rate=params["umap_kwargs"]["negative_sample_rate"],
+            init_pos=params["umap_kwargs"]["init_pos"],
+            random_state=params["umap_kwargs"]["random_state"],
+            umap_kwargs=params["umap_kwargs"],
         )
 
         if "pca_fit" not in adata.uns_keys() or type(adata.uns["pca_fit"]) == str:
             CM = adata.X[:, adata.var.use_for_dynamics.values]
             from ..preprocessing.utils import pca
 
-            adata, pca_fit, X_pca = pca(adata, CM, n_pca_components, "X", return_all=True)
+            adata, pca_fit, X_pca = pca(adata, CM, params["n_pca_components"], "X", return_all=True)
             adata.uns["pca_fit"] = pca_fit
 
-        X_pca, pca_fit = adata.obsm["X"], adata.uns["pca_fit"]
+        X_pca, pca_fit = adata.obsm[DKM.X_PCA], adata.uns["pca_fit"]
         V = adata[:, adata.var.use_for_dynamics.values].layers[vkey] if vkey in adata.layers.keys() else None
         CM, V = CM.A if sp.issparse(CM) else CM, V.A if sp.issparse(V) else V
         V[np.isnan(V)] = 0
+        V[np.isinf(V)] = 0  ############### NEED TO CHECK!
         Y_pca = pca_fit.transform(CM + V)
 
-        Y = umap_trans.transform(Y_pca)
-
+        Y = mapper.transform(Y_pca)
         delta_X = Y - X_embedding
 
+        T = transition_matrix  ############## NEED TO CHECK!
+
     if method not in ["pearson", "cosine"]:
         X_grid, V_grid, D = velocity_on_grid(X_embedding[:, :2], delta_X[:, :2], xy_grid_nums=xy_grid_nums)
         if calc_rnd_vel: