Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix error regarding saving file as .h5ad with adata.write #455

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions dynamo/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class DynamoAdataKeyManager:
# special key names frequently used in dynamo
X_LAYER = "X"
PROTEIN_LAYER = "protein"
X_PCA = "X_pca"

def gen_new_layer_key(layer_name, key, sep="_") -> str:
"""utility function for returning a new key name for a specific layer. By convention layer_name should not have the separator as the last character."""
Expand Down Expand Up @@ -106,7 +107,7 @@ def check_if_layer_exist(adata: AnnData, layer: str) -> bool:
def get_available_layer_keys(adata, layers="all", remove_pp_layers=True, include_protein=True):
"""Get the list of available layers' keys. If `layers` is set to all, return a list of all available layers; if `layers` is set to a list, then the intersetion of available layers and `layers` will be returned."""
layer_keys = list(adata.layers.keys())
if layers is None: # layers=adata.uns["pp"]["experiment_layers"], in calc_sz_factor
if layers is None: # layers=adata.uns["pp"]["experiment_layers"], in calc_sz_factor
layers = "X"
if remove_pp_layers:
layer_keys = [i for i in layer_keys if not i.startswith("X_")]
Expand Down Expand Up @@ -143,10 +144,7 @@ def allowed_X_layer_names():
def init_uns_pp_namespace(adata: AnnData):
adata.uns[DynamoAdataKeyManager.UNS_PP_KEY] = {}

def get_excluded_layers(
X_total_layers: bool = False,
splicing_total_layers: bool = False
) -> List:
def get_excluded_layers(X_total_layers: bool = False, splicing_total_layers: bool = False) -> List:
"""Get a list of excluded layers based on the provided arguments.

When splicing_total_layers is False, the function normalize spliced and unspliced RNA separately using each
Expand Down Expand Up @@ -199,6 +197,7 @@ def aggregate_layers_into_total(
layers.extend(["_total_"])
return total_layers, layers


# TODO discuss alias naming convention
DKM = DynamoAdataKeyManager

Expand Down
2 changes: 1 addition & 1 deletion dynamo/plot/networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import networkx as nx
import numpy as np
import pandas as pd

from anndata import AnnData
from matplotlib.axes import Axes

from ..tools.utils import flatten, index_gene, update_dict
from .utils import save_fig, set_colorbar
from .utils_graph import ArcPlot
Expand Down
2 changes: 0 additions & 2 deletions dynamo/plot/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def basic_stats(
s_kwargs["close"] = False
save_fig(**s_kwargs)
if save_show_or_return in ["show", "both", "all"]:
import matplotlib.pyplot as plt
plt.tight_layout()
plt.show()
if save_show_or_return in ["return", "all"]:
Expand Down Expand Up @@ -435,7 +434,6 @@ def biplot(
figsize: Tuple[float, float] = (6, 4),
scale_pca_embedding: bool = False,
draw_pca_embedding: bool = False,

save_show_or_return: Literal["save", "show", "return"] = "show",
save_kwargs: Dict[str, Any] = {},
ax: Optional[Axes] = None,
Expand Down
3 changes: 2 additions & 1 deletion dynamo/plot/scatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,8 @@ def _plot_basis_layer(cur_b, cur_l):
if deaxis:
deaxis_all(ax)

ax.set_title(cur_title)
ax.set_title(cur_title, color=font_color)
ax.tick_params(axis="both", colors=font_color)

axes_list.append(ax)
color_list.append(color_out)
Expand Down
26 changes: 22 additions & 4 deletions dynamo/prediction/fate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

from ..configuration import DKM
from ..dynamo_logger import (
LoggerManager,
main_info,
main_info_insert_adata,
main_warning,
)
from ..tools.connectivity import get_mapper_umap
from ..tools.utils import fetch_states, getTseq
from ..vectorfield import vector_field_function
from ..vectorfield.utils import vecfld_from_adata, vector_transformation
Expand Down Expand Up @@ -163,17 +165,33 @@ def fate(
# this requires umap 0.4; reverse project to PCA space.
if prediction.ndim == 1:
prediction = prediction[None, :]
exprs = adata.uns["umap_fit"]["fit"].inverse_transform(prediction)

params = adata.uns["umap"]
mapper = get_mapper_umap(
params["X_data"],
n_components=params["umap_kwargs"]["n_components"],
metric=params["umap_kwargs"]["metric"],
min_dist=params["umap_kwargs"]["min_dist"],
spread=params["umap_kwargs"]["spread"],
max_iter=params["umap_kwargs"]["max_iter"],
alpha=params["umap_kwargs"]["alpha"],
gamma=params["umap_kwargs"]["gamma"],
negative_sample_rate=params["umap_kwargs"]["negative_sample_rate"],
init_pos=params["umap_kwargs"]["init_pos"],
random_state=params["umap_kwargs"]["random_state"],
umap_kwargs=params["umap_kwargs"],
)
exprs = mapper.inverse_transform(prediction)

# further reverse project back to raw expression space
PCs = adata.uns["PCs"].T
if PCs.shape[0] == exprs.shape[1]:
exprs = np.expm1(exprs @ PCs + adata.uns["pca_mean"])

ndim = adata.uns["umap_fit"]["fit"]._raw_data.shape[1]
ndim = mapper._raw_data.shape[1]

if "X" in adata.obsm_keys():
if ndim == adata.obsm["X"].shape[1]: # lift the dimension up again
if ndim == adata.obsm[DKM.X_PCA].shape[1]: # lift the dimension up again
exprs = adata.uns["pca_fit"].inverse_transform(prediction)

if adata.var.use_for_dynamics.sum() == exprs.shape[1]:
Expand Down Expand Up @@ -211,7 +229,7 @@ def _fate(
interpolation_num: int = 250,
average: bool = True,
sampling: str = "arc_length",
cores:int = 1,
cores: int = 1,
) -> Tuple[np.ndarray, np.ndarray]:
"""Predict the historical and future cell transcriptomic states over arbitrary time scales by integrating vector
field functions from one or a set of initial cell state(s).
Expand Down
17 changes: 8 additions & 9 deletions dynamo/preprocessing/Preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
main_info_insert_adata,
main_warning,
)
from ..tools.connectivity import neighbors as default_neighbors
from ..tools.utils import update_dict
from .cell_cycle import cell_cycle_scores
from .external import (
normalize_layers_pearson_residuals,
sctransform,
select_genes_by_pearson_residuals,
)
from ..tools.connectivity import neighbors as default_neighbors
from ..tools.utils import update_dict
from .cell_cycle import cell_cycle_scores
from .gene_selection import select_genes_by_seurat_recipe, select_genes_monocle
from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca
from .preprocessor_utils import (
Expand Down Expand Up @@ -115,7 +115,7 @@ def __init__(
self.basic_stats = basic_stats
self.convert_layers2csr = convert_layers2csr
self.unique_var_obs_adata = unique_var_obs_adata
self.norm_method = log1p
self.norm_method = norm_method
self.norm_method_kwargs = norm_method_kwargs
self.sctransform = sctransform

Expand All @@ -130,7 +130,6 @@ def __init__(
self.pca_kwargs = pca_kwargs
self.cell_cycle_score = cell_cycle_scores

# self.n_top_genes = n_top_genes
self.convert_gene_name = convert_gene_name_function
self.collapse_species_adata = collapse_species_adata_function
self.gene_append_list = gene_append_list
Expand Down Expand Up @@ -443,7 +442,7 @@ def preprocess_adata_seurat_wo_pca(

temp_logger.finish_progress(progress_name="preprocess by seurat wo pca recipe")

def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None:
def config_monocle_recipe(self, adata: AnnData) -> None:
"""Automatically configure the preprocessor for monocle recipe.

Args:
Expand Down Expand Up @@ -483,15 +482,15 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None
"shared_count": 30,
}
self.select_genes = select_genes_monocle
self.select_genes_kwargs = {"n_top_genes": n_top_genes, "SVRs_kwargs": {"relative_expr": False}}
self.select_genes_kwargs.update({"SVRs_kwargs": {"relative_expr": False}})
self.normalize_selected_genes = None
self.normalize_by_cells = normalize
self.norm_method = log1p

self.regress_out_kwargs = update_dict({"obs_keys": []}, self.regress_out_kwargs)
self.regress_out_kwargs.update({"obs_keys": []})

self.pca = pca
self.pca_kwargs = {"pca_key": "X_pca"}
self.pca_kwargs.update({"pca_key": "X_pca"})

self.cell_cycle_score_kwargs = {
"layer": None,
Expand Down
2 changes: 1 addition & 1 deletion dynamo/preprocessing/cell_cycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def get_cell_phase(
else:
cell_phase_genes = gene_list

adata.uns["cell_phase_genes"] = cell_phase_genes
adata.uns["cell_phase_genes"] = dict(cell_phase_genes) # OrderedDict is not hdf5 serializable, so convert to dict.
# score each cell cycle phase and Z-normalize
phase_scores = pd.DataFrame(batch_group_score(adata, layer, cell_phase_genes))
normalized_phase_scores = phase_scores.sub(phase_scores.mean(axis=1), axis=0).div(phase_scores.std(axis=1), axis=0)
Expand Down
4 changes: 0 additions & 4 deletions dynamo/preprocessing/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -1147,8 +1147,6 @@ def recipe_monocle(

if method == "pca":
adata = pca(adata, pca_input, num_dim, "X_" + method.lower())
# TODO remove adata.obsm["X"] in future, use adata.obsm.X_pca instead
adata.obsm["X"] = adata.obsm["X_" + method.lower()]

elif method == "ica":
fit = FastICA(
Expand All @@ -1159,9 +1157,7 @@ def recipe_monocle(
max_iter=1000,
)
reduce_dim = fit.fit_transform(pca_input.toarray())

adata.obsm["X_" + method.lower()] = reduce_dim
adata.obsm["X"] = adata.obsm["X_" + method.lower()]

logger.info_insert_adata(method + "_fit", "uns")
adata.uns[method + "_fit"], adata.uns["feature_selection"] = (
Expand Down
16 changes: 9 additions & 7 deletions dynamo/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,10 @@ def convert2symbol(adata: AnnData, scopes: Union[str, Iterable, None] = None, su
merge_df = adata.var.merge(official_gene_df, left_on="query", right_on="query", how="left").set_index(
adata.var.index
)
adata.var = merge_df

valid_ind = np.where(merge_df["notfound"] != True)[0] # noqa: E712
merge_df.pop("notfound")
adata.var = merge_df

if subset is True:
adata._inplace_subset_var(valid_ind)
Expand Down Expand Up @@ -535,13 +537,13 @@ def get_gene_selection_filter(
) -> np.ndarray:
"""Generate the mask by sorting given table of scores.

Args:
valid_table: the scores used to sort the highly variable genes.
n_top_genes: number of top genes to be filtered. Defaults to 2000.
basic_filter: the filter to remove outliers. For example, the `adata.var["pass_basic_filter"]`.
Args:
valid_table: the scores used to sort the highly variable genes.
n_top_genes: number of top genes to be filtered. Defaults to 2000.
basic_filter: the filter to remove outliers. For example, the `adata.var["pass_basic_filter"]`.

Returns:
The filter mask as a bool array.
Returns:
The filter mask as a bool array.
"""
if basic_filter is None:
basic_filter = pd.Series(True, index=valid_table.index)
Expand Down
39 changes: 30 additions & 9 deletions dynamo/tools/cell_velocities.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
from sklearn.decomposition import PCA
from sklearn.utils import sparsefuncs

from ..configuration import DKM
from ..dynamo_logger import LoggerManager, main_info, main_warning
from ..tools.connectivity import get_mapper_umap
from ..utils import areinstance
from .connectivity import _gen_neighbor_keys, adj_to_knn, check_and_recompute_neighbors
from .dimension_reduction import reduceDimension
Expand Down Expand Up @@ -319,10 +321,10 @@ def cell_velocities(
V = V.A if sp.issparse(V) else V
X = X.A if sp.issparse(X) else X
finite_inds = get_finite_inds(V)
X, V = X[:, finite_inds], V[:, finite_inds]

if sum(finite_inds) != X.shape[0]:
if sum(finite_inds) != X.shape[1]:
main_info(f"{X.shape[1] - sum(finite_inds)} genes are removed because of nan velocity values.")
X, V = X[:, finite_inds], V[:, finite_inds]
if transition_genes is not None: # if X, V is provided by the user, transition_genes will be None
adata.var.loc[np.array(transition_genes)[~finite_inds], "use_for_transition"] = False

Expand Down Expand Up @@ -518,28 +520,47 @@ def cell_velocities(
adata.obsp["discrete_vector_field"] = E

elif method == "transform":
umap_trans, n_pca_components = (
adata.uns["umap_fit"]["fit"],
adata.uns["umap_fit"]["n_pca_components"],
if "X_data" not in adata.uns["umap"].keys():
raise Exception(
f"Parameter info of UMAP is missing in the provided anndata object. "
"Run `dyn.tl.reduceDimension` before doing this."
)

params = adata.uns["umap"]
mapper = get_mapper_umap(
params["X_data"],
n_components=params["umap_kwargs"]["n_components"],
metric=params["umap_kwargs"]["metric"],
min_dist=params["umap_kwargs"]["min_dist"],
spread=params["umap_kwargs"]["spread"],
max_iter=params["umap_kwargs"]["max_iter"],
alpha=params["umap_kwargs"]["alpha"],
gamma=params["umap_kwargs"]["gamma"],
negative_sample_rate=params["umap_kwargs"]["negative_sample_rate"],
init_pos=params["umap_kwargs"]["init_pos"],
random_state=params["umap_kwargs"]["random_state"],
umap_kwargs=params["umap_kwargs"],
)

if "pca_fit" not in adata.uns_keys() or type(adata.uns["pca_fit"]) == str:
CM = adata.X[:, adata.var.use_for_dynamics.values]
from ..preprocessing.utils import pca

adata, pca_fit, X_pca = pca(adata, CM, n_pca_components, "X", return_all=True)
adata, pca_fit, X_pca = pca(adata, CM, params["n_pca_components"], "X", return_all=True)
adata.uns["pca_fit"] = pca_fit

X_pca, pca_fit = adata.obsm["X"], adata.uns["pca_fit"]
X_pca, pca_fit = adata.obsm[DKM.X_PCA], adata.uns["pca_fit"]
V = adata[:, adata.var.use_for_dynamics.values].layers[vkey] if vkey in adata.layers.keys() else None
CM, V = CM.A if sp.issparse(CM) else CM, V.A if sp.issparse(V) else V
V[np.isnan(V)] = 0
V[np.isinf(V)] = 0 ############### NEED TO CHECK!
Y_pca = pca_fit.transform(CM + V)

Y = umap_trans.transform(Y_pca)

Y = mapper.transform(Y_pca)
delta_X = Y - X_embedding

T = transition_matrix ############## NEED TO CHECK!

if method not in ["pearson", "cosine"]:
X_grid, V_grid, D = velocity_on_grid(X_embedding[:, :2], delta_X[:, :2], xy_grid_nums=xy_grid_nums)
if calc_rnd_vel:
Expand Down
Loading