From b0bc71d4a3acae077627f93456edf046c7458f2e Mon Sep 17 00:00:00 2001 From: Severin Dicks <37635888+Intron7@users.noreply.github.com> Date: Fri, 4 Aug 2023 14:20:37 +0200 Subject: [PATCH] adds Sparse PCA for rapids-singlecell (#36) * adds ruff & Sparse PCA * ruff updates * updates PCA_sparse * adds test for sparse PCA * fixes plotting api * fixes plotting * version bump --- .pre-commit-config.yaml | 5 + docs/conf.py | 10 +- pyproject.toml | 33 +++ rapids_singlecell/__init__.py | 9 +- rapids_singlecell/cunnData/__init__.py | 51 ++-- rapids_singlecell/cunnData_funcs/__init__.py | 15 +- rapids_singlecell/cunnData_funcs/_hvg.py | 58 ++-- .../cunnData_funcs/_normalize.py | 17 +- rapids_singlecell/cunnData_funcs/_pca.py | 252 +++++++++++++++++- rapids_singlecell/cunnData_funcs/_plotting.py | 21 +- .../cunnData_funcs/_regress_out.py | 14 +- rapids_singlecell/cunnData_funcs/_scale.py | 6 +- rapids_singlecell/cunnData_funcs/_simple.py | 28 +- rapids_singlecell/cunnData_funcs/_utils.py | 3 +- .../decoupler_gpu/_method_mlm.py | 17 +- .../decoupler_gpu/_method_wsum.py | 15 +- rapids_singlecell/pl.py | 3 +- rapids_singlecell/scanpy_gpu/__init__.py | 5 +- rapids_singlecell/scanpy_gpu/_clustering.py | 25 +- rapids_singlecell/scanpy_gpu/_diffmap.py | 11 +- rapids_singlecell/scanpy_gpu/_draw_graph.py | 12 +- .../scanpy_gpu/_embedding_density.py | 10 +- rapids_singlecell/scanpy_gpu/_harmonpy_gpu.py | 13 +- rapids_singlecell/scanpy_gpu/_pymde.py | 25 +- .../scanpy_gpu/_rank_gene_groups.py | 34 +-- rapids_singlecell/scanpy_gpu/_tsne.py | 5 +- rapids_singlecell/squidpy_gpu/_autocorr.py | 18 +- rapids_singlecell/squidpy_gpu/_gearysc.py | 4 +- rapids_singlecell/squidpy_gpu/_ligrec.py | 27 +- rapids_singlecell/squidpy_gpu/_moransi.py | 3 +- rapids_singlecell/squidpy_gpu/_utils.py | 9 +- rapids_singlecell/tests/test_autocorr.py | 8 +- rapids_singlecell/tests/test_cunndata.py | 20 +- rapids_singlecell/tests/test_decoupler.py | 6 +- .../tests/test_emdedding_density.py | 2 +- rapids_singlecell/tests/test_hvg.py | 12 +- rapids_singlecell/tests/test_ligrec.py | 14 +- rapids_singlecell/tests/test_normalization.py | 6 +- rapids_singlecell/tests/test_pca.py | 30 ++- rapids_singlecell/tests/test_preprocessing.py | 4 +- rapids_singlecell/tests/test_qc_metrics.py | 13 +- .../tests/test_rank_genes_groups_logreg.py | 2 +- 42 files changed, 582 insertions(+), 293 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 957b3d4f..5d50f16b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,3 +9,8 @@ repos: rev: 23.7.0 hooks: - id: black +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.282 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] diff --git a/docs/conf.py b/docs/conf.py index 748dd9b0..b9030243 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -121,11 +121,11 @@ # html_theme = "scanpydoc" -html_theme_options = dict( - repository_url=repository_url, - repository_branch=os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main"), - use_repository_button=True, -) +html_theme_options = { + "repository_url": repository_url, + "repository_branch": os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main"), + "use_repository_button": True, +} html_show_sphinx = False html_logo = "_static/logo3.svg" diff --git a/pyproject.toml b/pyproject.toml index 4df9a396..2e91727d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,39 @@ line-length = 88 target-version = ['py38'] include = '^rapids_singlecell/.*\.py$' +[tool.ruff] +src = ["rapids_singlecell"] +exclude = ["rapids_singlecell/tests"] +line-length = 88 +select = [ + "F", # Errors detected by Pyflakes + "E", # Error detected by Pycodestyle + "W", # Warning detected by Pycodestyle + "I", # isort + "TID", # flake8-tidy-imports + "C4", # flake8-comprehensions + "BLE", # flake8-blind-except + "UP", # pyupgrade + "RUF100", # Report unused noqa directives +] +ignore = [ + # line too long -> we accept long comment lines; black gets rid of long code lines + "E501", + # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient + "E731", + # allow I, O, l as variable names -> I is the identity matrix + "E741", + # Missing docstring in public package + "F403", + # First line should be in imperative mood; try rephrasing +] + + +[tool.ruff.per-file-ignores] +"docs/*" = ["I"] +"tests/*" = ["D"] +"*/__init__.py" = ["F401"] + [tool.flit.sdist] exclude = [ "rapids_singlecell/tests", diff --git a/rapids_singlecell/__init__.py b/rapids_singlecell/__init__.py index 5db39bc5..9233d7e3 100755 --- a/rapids_singlecell/__init__.py +++ b/rapids_singlecell/__init__.py @@ -1,8 +1,3 @@ -from . import cunnData -from . import pp -from . import dcg -from . import tl -from . import pl -from . import gr +from . import cunnData, dcg, gr, pl, pp, tl -__version__ = "0.7.2" +__version__ = "0.7.5" diff --git a/rapids_singlecell/cunnData/__init__.py b/rapids_singlecell/cunnData/__init__.py index 114e5dad..6579b922 100644 --- a/rapids_singlecell/cunnData/__init__.py +++ b/rapids_singlecell/cunnData/__init__.py @@ -1,34 +1,31 @@ +import warnings +from collections import OrderedDict +from itertools import repeat +from typing import Any, List, Mapping, MutableMapping, Optional, Union + +import anndata import cupy as cp import cupyx as cpx -from anndata import AnnData -from anndata._core.index import _normalize_indices -import anndata - import numpy as np import pandas as pd -from scipy import sparse -from collections import OrderedDict -from typing import Any, Union, Optional, Mapping, MutableMapping, List -from pandas.api.types import infer_dtype, is_string_dtype -from itertools import repeat -import warnings - +from anndata import AnnData +from anndata._core.index import _normalize_indices +from cupyx.scipy.sparse import issparse as issparse_gpu from natsort import natsorted - +from pandas.api.types import infer_dtype, is_string_dtype +from scipy import sparse from scipy.sparse import issparse as issparse_cpu -from cupyx.scipy.sparse import issparse as issparse_gpu class Layer_Mapping(dict): - """ - Dictonary subclass for layers handeling in cunnData - """ + """Dictonary subclass for layers handeling in cunnData""" def __init__(self, shape=None): super().__init__({}) self.shape = shape def update_shape(self, shape): + """Updates Shape for Layers""" self.shape = shape def __setitem__(self, key, item): @@ -56,15 +53,14 @@ def __setitem__(self, key, item): class obsm_Mapping(dict): - """ - Dictonary subclass for obsm handeling in cunnData - """ + """Dictonary subclass for obsm handeling in cunnData""" def __init__(self, shape=None): super().__init__({}) self.shape = shape def update_shape(self, shape): + """Updates Shape for obsm""" self.shape = shape def __setitem__(self, key, item): @@ -75,15 +71,14 @@ def __setitem__(self, key, item): class varm_Mapping(dict): - """ - Dictonary subclass for obsm handeling in cunnData - """ + """Dictonary subclass for obsm handeling in cunnData""" def __init__(self, shape=None): super().__init__({}) self.shape = shape def update_shape(self, shape): + """Updates Shape for varm""" self.shape = shape def __setitem__(self, key, item): @@ -94,10 +89,11 @@ def __setitem__(self, key, item): class cunnData: - """ + """\ The cunnData objects can be used as an AnnData replacement for the inital preprocessing of single cell Datasets. It replaces some of the most common preprocessing steps within scanpy for annData objects. + It can be initalized with a preexisting annData object or with a countmatrix and seperate Dataframes for var and obs. Index of var will be used as gene_names. Initalization with an AnnData object is advised. @@ -310,8 +306,9 @@ def uns(self): @property def layers(self): - """\ + """ Dictionary-like object with values of the same dimensions as :attr:`.X`. + Layers in cunnData are inspired by AnnData. Return the layer named `"unspliced"`:: @@ -329,9 +326,10 @@ def layers(self): @property def obsm(self): - """\ + """ Multi-dimensional annotation of observations (mutable structured :class:`~numpy.ndarray`). + Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` of length :attr:`n_obs`. Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. @@ -343,6 +341,7 @@ def varm(self): """\ Multi-dimensional annotation of variables/features (mutable structured :class:`~numpy.ndarray`). + Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` of length :attr:`n_vars`. Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`. @@ -614,7 +613,7 @@ def varm_keys(self) -> List[str]: def uns_keys(self) -> List[str]: """List keys of unstructured annotation.""" - return sorted(list(self._uns.keys())) + return sorted(self._uns.keys()) def to_AnnData(self): """ diff --git a/rapids_singlecell/cunnData_funcs/__init__.py b/rapids_singlecell/cunnData_funcs/__init__.py index 4a71754f..624ecc36 100644 --- a/rapids_singlecell/cunnData_funcs/__init__.py +++ b/rapids_singlecell/cunnData_funcs/__init__.py @@ -1,7 +1,12 @@ +from ._hvg import highly_variable_genes +from ._normalize import log1p, normalize_pearson_residuals, normalize_total +from ._pca import pca from ._regress_out import regress_out from ._scale import scale -from ._pca import pca -from ._hvg import highly_variable_genes -from ._normalize import normalize_pearson_residuals, log1p, normalize_total -from ._simple import filter_cells, filter_genes, filter_highly_variable -from ._simple import calculate_qc_metrics, flag_gene_family +from ._simple import ( + calculate_qc_metrics, + filter_cells, + filter_genes, + filter_highly_variable, + flag_gene_family, +) diff --git a/rapids_singlecell/cunnData_funcs/_hvg.py b/rapids_singlecell/cunnData_funcs/_hvg.py index e61b8e79..461baa7c 100644 --- a/rapids_singlecell/cunnData_funcs/_hvg.py +++ b/rapids_singlecell/cunnData_funcs/_hvg.py @@ -1,12 +1,13 @@ -import cupy as cp -import cupyx as cpx -import numpy as np -import pandas as pd import math import warnings from typing import Optional -from ..cunnData import cunnData +import cupy as cp +import numpy as np +import pandas as pd + +from rapids_singlecell.cunnData import cunnData + from ._utils import _check_nonnegative_integers, _get_mean_var @@ -203,15 +204,15 @@ def highly_variable_genes( df = pd.concat(df, axis=0) df["highly_variable"] = df["highly_variable"].astype(int) df = df.groupby("gene").agg( - dict( - means=np.nanmean, - dispersions=np.nanmean, - dispersions_norm=np.nanmean, - highly_variable=np.nansum, - ) + { + "means": np.nanmean, + "dispersions": np.nanmean, + "dispersions_norm": np.nanmean, + "highly_variable": np.nansum, + } ) df.rename( - columns=dict(highly_variable="highly_variable_nbatches"), inplace=True + columns={"highly_variable": "highly_variable_nbatches"}, inplace=True ) df["highly_variable_intersection"] = df["highly_variable_nbatches"] == len( batches @@ -269,11 +270,12 @@ def _highly_variable_genes_single_batch( ): """\ See `highly_variable_genes`. - Returns - ------- - A DataFrame that contains the columns - `highly_variable`, `means`, `dispersions`, and `dispersions_norm`. - """ + + Returns + ------- + A DataFrame that contains the columns + `highly_variable`, `means`, `dispersions`, and `dispersions_norm`. + """ if flavor == "seurat": X = X.expm1() mean, var = _get_mean_var(X, axis=1) @@ -297,7 +299,7 @@ def _highly_variable_genes_single_batch( # only a single gene fell in the bin and implicitly set them to have # a normalized disperion of 1 one_gene_per_bin = disp_std_bin.isnull() - gen_indices = np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist() + np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist() # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32, # but there’s still a dtype error without “.value”. @@ -368,6 +370,7 @@ def _highly_variable_genes_seurat_v3( """\ See `highly_variable_genes`. For further implementation details see https://www.overleaf.com/read/ckptrbgzzzpg + Returns ------- updates `.var` with the following fields: @@ -663,14 +666,14 @@ def _highly_variable_pearson_residuals( means, variances = _get_mean_var(X, axis=1) means, variances = means.get(), variances.get() df = pd.DataFrame.from_dict( - dict( - means=means, - variances=variances, - residual_variances=cp.mean(residual_gene_vars, axis=0).get(), - highly_variable_rank=medianrank_residual_var, - highly_variable_nbatches=highly_variable_nbatches.astype(np.int64), - highly_variable_intersection=highly_variable_nbatches == n_batches, - ) + { + "means": means, + "variances": variances, + "residual_variances": cp.mean(residual_gene_vars, axis=0).get(), + "highly_variable_rank": medianrank_residual_var, + "highly_variable_nbatches": highly_variable_nbatches.astype(np.int64), + "highly_variable_intersection": highly_variable_nbatches == n_batches, + } ) df = df.set_index(cudata.var_names) df.sort_values( @@ -715,6 +718,7 @@ def _poisson_gene_selection( The method accounts for library size internally, a raw count matrix should be provided. Instead of Z-test, enrichment of zeros is quantified by posterior probabilites from a binomial model, computed through sampling. + Parameters ---------- cudata @@ -733,6 +737,7 @@ def _poisson_gene_selection( Size of temporary matrix for incremental calculation. Larger is faster but requires more RAM or GPU memory. (The default should be fine unless there are hundreds of millions cells or millions of genes.) + Returns ------- Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or @@ -750,7 +755,6 @@ def _poisson_gene_selection( prob_zero_enriched_nbatches : int If batch_key is given, this denotes in how many batches genes are detected as zero enriched """ - try: import torch except ImportError: diff --git a/rapids_singlecell/cunnData_funcs/_normalize.py b/rapids_singlecell/cunnData_funcs/_normalize.py index 9bc03168..094d0785 100644 --- a/rapids_singlecell/cunnData_funcs/_normalize.py +++ b/rapids_singlecell/cunnData_funcs/_normalize.py @@ -1,10 +1,12 @@ -import cupy as cp -import cupyx as cpx import math import warnings from typing import Optional -from ..cunnData import cunnData +import cupy as cp +import cupyx as cpx + +from rapids_singlecell.cunnData import cunnData + from ._utils import _check_nonnegative_integers @@ -100,7 +102,7 @@ def log1p( Whether to return a copy or update `cudata`. Returns - ---------- + ------- The resulting sparse matrix after applying the natural logarithm of one plus the input matrix. \ If `copy` is set to True, returns the new sparse matrix. Otherwise, updates the `cudata` object \ in-place and returns None. @@ -310,12 +312,11 @@ def normalize_pearson_residuals( If True, update cunnData with results. Otherwise, return results. See below for details of what is returned. Returns - ---------- + ------- If `inplace=True`, `cudata.X` or the selected layer in `cudata.layers` is updated with the normalized values. \ If `inplace=False` the normalized matrix is returned. """ - X = cudata.layers[layer] if layer is not None else cudata.X if check_values and not _check_nonnegative_integers(X): @@ -324,7 +325,7 @@ def normalize_pearson_residuals( UserWarning, ) computed_on = layer if layer else "cudata.X" - settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on) + settings_dict = {"theta": theta, "clip": clip, "computed_on": computed_on} if theta <= 0: raise ValueError("Pearson residuals require theta > 0") if clip is None: @@ -424,7 +425,7 @@ def normalize_pearson_residuals( ), ) - if inplace == True: + if inplace is True: cudata.uns["pearson_residuals_normalization"] = settings_dict if layer: cudata.layers[layer] = residuals diff --git a/rapids_singlecell/cunnData_funcs/_pca.py b/rapids_singlecell/cunnData_funcs/_pca.py index 8cc7561a..895d0ee7 100644 --- a/rapids_singlecell/cunnData_funcs/_pca.py +++ b/rapids_singlecell/cunnData_funcs/_pca.py @@ -1,12 +1,17 @@ -from ..cunnData import cunnData -from anndata import AnnData - -from cuml.decomposition import PCA, TruncatedSVD +import math from typing import Optional, Union -from cupy.sparse import issparse -import math +import cupy as cp import numpy as np +from anndata import AnnData +from cuml.common.kernel_utils import cuda_kernel_factory +from cuml.decomposition import PCA, TruncatedSVD +from cuml.internals.input_utils import sparse_scipy_to_cp +from cupyx.scipy.sparse import csr_matrix, isspmatrix_csr +from cupyx.scipy.sparse import issparse as cpissparse +from scipy.sparse import issparse + +from rapids_singlecell.cunnData import cunnData def pca( @@ -68,7 +73,6 @@ def pca( Explained variance, equivalent to the eigenvalues of the covariance matrix. """ - if use_highly_variable is True and "highly_variable" not in cudata.var.keys(): raise ValueError( "Did not find cudata.var['highly_variable']. " @@ -106,14 +110,22 @@ def pca( start_idx = batch * chunk_size stop_idx = min(batch * chunk_size + chunk_size, X.shape[0]) chunk = X[start_idx:stop_idx, :] - chunk = chunk.toarray() if issparse(chunk) else chunk + if issparse(chunk) or cpissparse(chunk): + chunk = chunk.toarray() X_pca[start_idx:stop_idx] = pca_func.transform(chunk) else: if zero_center: - pca_func = PCA( - n_components=n_comps, random_state=random_state, output_type="numpy" - ) - X_pca = pca_func.fit_transform(X) + if cpissparse(X) or issparse(X): + if issparse(X): + X = sparse_scipy_to_cp(X, dtype=X.dtype) + X = csr_matrix(X) + pca_func = PCA_sparse(n_components=n_comps) + X_pca = pca_func.fit_transform(X) + else: + pca_func = PCA( + n_components=n_comps, random_state=random_state, output_type="numpy" + ) + X_pca = pca_func.fit_transform(X) elif not zero_center: pca_func = TruncatedSVD( @@ -131,3 +143,219 @@ def pca( cudata.varm["PCs"][cudata.var["highly_variable"]] = pca_func.components_.T else: cudata.varm["PCs"] = pca_func.components_.T + + +class PCA_sparse: + def __init__(self, n_components) -> None: + self.n_components = n_components + + def fit(self, x): + if self.n_components is None: + n_rows = x.shape[0] + n_cols = x.shape[1] + self.n_components_ = min(n_rows, n_cols) + else: + self.n_components_ = self.n_components + + if not isspmatrix_csr(x): + x = x.tocsr() + self.n_samples_ = x.shape[0] + self.n_features_in_ = x.shape[1] if x.ndim == 2 else 1 + self.dtype = x.data.dtype + + covariance, self.mean_, _ = _cov_sparse(x=x, return_mean=True) + + self.explained_variance_, self.components_ = cp.linalg.eigh( + covariance, UPLO="U" + ) + + # NOTE: We reverse the eigen vector and eigen values here + # because cupy provides them in ascending order. Make a copy otherwise + # it is not C_CONTIGUOUS anymore and would error when converting to + # CumlArray + self.explained_variance_ = self.explained_variance_[::-1] + + self.components_ = cp.flip(self.components_, axis=1) + + self.components_ = self.components_.T[: self.n_components_, :] + + self.explained_variance_ratio_ = self.explained_variance_ / cp.sum( + self.explained_variance_ + ) + + self.explained_variance_ = self.explained_variance_[: self.n_components_] + + self.explained_variance_ratio_ = self.explained_variance_ratio_[ + : self.n_components_ + ] + + return self + + def transform(self, X): + X = X - self.mean_ + X_transformed = X.dot(self.components_.T) + self.components_ = self.components_.get() + self.explained_variance_ = self.explained_variance_.get() + self.explained_variance_ratio_ = self.explained_variance_ratio_.get() + return X_transformed.get() + + def fit_transform(self, X, y=None): + return self.fit(X).transform(X) + + +cov_kernel_str = r""" +({0} *cov_values, {0} *gram_matrix, {0} *mean_x, {0} *mean_y, int n_cols) { + + int rid = blockDim.x * blockIdx.x + threadIdx.x; + int cid = blockDim.y * blockIdx.y + threadIdx.y; + + if(rid >= n_cols || cid >= n_cols) return; + + cov_values[rid * n_cols + cid] = \ + gram_matrix[rid * n_cols + cid] - mean_x[rid] * mean_y[cid]; +} +""" + +gramm_kernel_csr = r""" +(const int *indptr, const int *index, {0} *data, int nrows, int ncols, {0} *out) { + int row = blockIdx.x; + int col = threadIdx.x; + + if(row >= nrows) return; + + int start = indptr[row]; + int end = indptr[row + 1]; + + for (int idx1 = start; idx1 < end; idx1++){ + int index1 = index[idx1]; + {0} data1 = data[idx1]; + for(int idx2 = idx1 + col; idx2 < end; idx2 += blockDim.x){ + int index2 = index[idx2]; + {0} data2 = data[idx2]; + atomicAdd(&out[index1 * ncols + index2], data1 * data2); + } + } +} +""" + + +copy_kernel = r""" +({0} *out, int ncols) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + + if (row >= ncols || col >= ncols) return; + + if (row > col) { + out[row * ncols + col] = out[col * ncols + row]; + } +} +""" + + +def _cov_kernel(dtype): + return cuda_kernel_factory(cov_kernel_str, (dtype,), "cov_kernel") + + +def _gramm_kernel_csr(dtype): + return cuda_kernel_factory(gramm_kernel_csr, (dtype,), "gramm_kernel_csr") + + +def _copy_kernel(dtype): + return cuda_kernel_factory(copy_kernel, (dtype,), "copy_kernel") + + +def _cov_sparse(x, return_gram=False, return_mean=False): + """ + Computes the mean and the covariance of matrix X of + the form Cov(X, X) = E(XX) - E(X)E(X) + + This is a temporary fix for + cuml issue #5475 and cupy issue #7699, + where the operation `x.T.dot(x)` did not work for + larger sparse matrices. + + Parameters + ---------- + + x : cupyx.scipy.sparse of size (m, n) + return_gram : boolean (default = False) + If True, gram matrix of the form (1 / n) * X.T.dot(X) + will be returned. + When True, a copy will be created + to store the results of the covariance. + When False, the local gram matrix result + will be overwritten + return_mean: boolean (default = False) + If True, the Maximum Likelihood Estimate used to + calculate the mean of X and X will be returned, + of the form (1 / n) * mean(X) and (1 / n) * mean(X) + + Returns + ------- + + result : cov(X, X) when return_gram and return_mean are False + cov(X, X), gram(X, X) when return_gram is True, + return_mean is False + cov(X, X), mean(X), mean(X) when return_gram is False, + return_mean is True + cov(X, X), gram(X, X), mean(X), mean(X) + when return_gram is True and return_mean is True + """ + + gram_matrix = cp.zeros((x.shape[1], x.shape[1]), dtype=x.data.dtype) + + block = (128,) + grid = (x.shape[0],) + compute_mean_cov = _gramm_kernel_csr(x.data.dtype) + compute_mean_cov( + grid, + block, + ( + x.indptr, + x.indices, + x.data, + x.shape[0], + x.shape[1], + gram_matrix, + ), + ) + + copy_gram = _copy_kernel(x.data.dtype) + block = (32, 32) + grid = (math.ceil(x.shape[1] / block[0]), math.ceil(x.shape[1] / block[1])) + copy_gram( + grid, + block, + (gram_matrix, x.shape[1]), + ) + + mean_x = x.sum(axis=0) * (1 / x.shape[0]) + gram_matrix *= 1 / x.shape[0] + + if return_gram: + cov_result = cp.zeros( + (gram_matrix.shape[0], gram_matrix.shape[0]), + dtype=gram_matrix.dtype, + ) + else: + cov_result = gram_matrix + + compute_cov = _cov_kernel(x.dtype) + + block_size = (32, 32) + grid_size = (math.ceil(gram_matrix.shape[0] / 8),) * 2 + compute_cov( + grid_size, + block_size, + (cov_result, gram_matrix, mean_x, mean_x, gram_matrix.shape[0]), + ) + + if not return_gram and not return_mean: + return cov_result + elif return_gram and not return_mean: + return cov_result, gram_matrix + elif not return_gram and return_mean: + return cov_result, mean_x, mean_x + elif return_gram and return_mean: + return cov_result, gram_matrix, mean_x, mean_x diff --git a/rapids_singlecell/cunnData_funcs/_plotting.py b/rapids_singlecell/cunnData_funcs/_plotting.py index 7b2b0f7a..720a2681 100644 --- a/rapids_singlecell/cunnData_funcs/_plotting.py +++ b/rapids_singlecell/cunnData_funcs/_plotting.py @@ -1,10 +1,9 @@ -from ..cunnData import cunnData +import os -import numpy as np -import pandas as pd -import seaborn as sns import matplotlib.pyplot as plt -import os +import seaborn as sns + +from rapids_singlecell.cunnData import cunnData def scatter( @@ -21,7 +20,7 @@ def scatter( Wraps :func:`seaborn.scatterplot` for :class:`~rapids_singlecell.cunnData.cunnData`. This plotting function so far is really basic and doesnt include all the features form :func:`scanpy.pl.scatter`. Parameters - --------- + ---------- cudata cunnData object x @@ -38,7 +37,7 @@ def scatter( """ fig, ax = plt.subplots() - if color == None: + if color is None: sns.scatterplot(data=cudata.obs, x=x, y=y, s=2, color="grey", edgecolor="grey") else: sns.scatterplot(data=cudata.obs, x=x, y=y, s=2, hue=color) @@ -65,7 +64,7 @@ def violin( Wraps :func:`seaborn.violinplot` for :class:`~rapids_singlecell.cunnData.cunnData`. This plotting function so far is really basic and doesnt include all the features form :func:`scanpy.pl.violin`. Parameters - --------- + ---------- cudata cunnData object key @@ -82,14 +81,14 @@ def violin( The resolution in dots per inch for save Returns - ------ + ------- nothing """ fig, ax = plt.subplots() - ax = sns.violinplot(data=cudata.obs, y=key, scale="width", x=groupby, inner=None) + sns.violinplot(data=cudata.obs, y=key, scale="width", x=groupby, inner=None) if size: - ax = sns.stripplot( + sns.stripplot( data=cudata.obs, y=key, x=groupby, diff --git a/rapids_singlecell/cunnData_funcs/_regress_out.py b/rapids_singlecell/cunnData_funcs/_regress_out.py index 3da3a2a0..a791422a 100644 --- a/rapids_singlecell/cunnData_funcs/_regress_out.py +++ b/rapids_singlecell/cunnData_funcs/_regress_out.py @@ -1,10 +1,11 @@ +import math +from typing import Literal, Optional, Union + import cupy as cp import cupyx as cpx from cuml.linear_model import LinearRegression + from rapids_singlecell.cunnData import cunnData -from typing import Literal, Union, Optional -from ..cunnData import cunnData -import math def regress_out( @@ -47,7 +48,6 @@ def regress_out( Returns a corrected copy or updates `cudata` with a corrected version of the \ original `cudata.X` and `cudata.layers['layer']`, depending on `inplace`. """ - if batchsize != "all" and type(batchsize) not in [int, type(None)]: raise ValueError("batchsize must be `int`, `None` or `'all'`") @@ -60,7 +60,7 @@ def regress_out( if type(keys) is list: dim_regressor = len(keys) + 1 - regressors = cp.ones((X.shape[0] * dim_regressor)).reshape( + regressors = cp.ones(X.shape[0] * dim_regressor).reshape( (X.shape[0], dim_regressor), order="F" ) if dim_regressor == 2: @@ -103,7 +103,7 @@ def regress_out( X = X.todense() for i in range(X.shape[1]): if verbose and i % 500 == 0: - print("Regressed %s out of %s" % (i, X.shape[1])) + print(f"Regressed {i} out of {X.shape[1]}") y = X[:, i] outputs[:, i] = _regress_out_chunk(regressors, y) @@ -121,12 +121,14 @@ def _regress_out_chunk(X, y): """ Performs a data_cunk.shape[1] number of local linear regressions, replacing the data in the original chunk w/ the regressed result. + Parameters ---------- X : cupy.ndarray of shape (n_cells, 3) Matrix of regressors y : cupy.sparse.spmatrix of shape (n_cells,) Sparse matrix containing a single column of the cellxgene matrix + Returns ------- dense_mat : cupy.ndarray of shape (n_cells,) diff --git a/rapids_singlecell/cunnData_funcs/_scale.py b/rapids_singlecell/cunnData_funcs/_scale.py index 36fa9ee7..da8d08bd 100644 --- a/rapids_singlecell/cunnData_funcs/_scale.py +++ b/rapids_singlecell/cunnData_funcs/_scale.py @@ -1,7 +1,9 @@ -import cupy as cp -from ..cunnData import cunnData from typing import Optional +import cupy as cp + +from rapids_singlecell.cunnData import cunnData + def scale( cudata: cunnData, diff --git a/rapids_singlecell/cunnData_funcs/_simple.py b/rapids_singlecell/cunnData_funcs/_simple.py index e34dc00d..309e8dbe 100644 --- a/rapids_singlecell/cunnData_funcs/_simple.py +++ b/rapids_singlecell/cunnData_funcs/_simple.py @@ -1,11 +1,11 @@ +import math +from typing import Union + import cupy as cp import cupyx as cpx -import math import numpy as np -import pandas as pd -import math -from ..cunnData import cunnData -from typing import Union + +from rapids_singlecell.cunnData import cunnData _sparse_qc_kernel_csc = cp.RawKernel( r""" @@ -213,7 +213,6 @@ def calculate_qc_metrics( E.g. 'pct_dropout_by_counts'. Percentage of cells this feature does not appear in. """ - X = cudata.X sums_cells = cp.zeros(X.shape[0], dtype=cp.float32) sums_genes = cp.zeros(X.shape[1], dtype=cp.float32) @@ -404,7 +403,6 @@ def filter_genes( a filtered :class:`~rapids_singlecell.cunnData.cunnData` object inplace """ - if qc_var in cudata.var.keys(): if min_count is not None and max_count is not None: thr = np.where( @@ -429,7 +427,7 @@ def filter_genes( "pct_dropout_by_counts", ]: print( - f"Running `calculate_qc_metrics` for 'n_cells_by_counts','total_counts','mean_counts' or 'pct_dropout_by_counts'" + "Running `calculate_qc_metrics` for 'n_cells_by_counts','total_counts','mean_counts' or 'pct_dropout_by_counts'" ) calculate_qc_metrics(cudata=cudata, log1p=False) if min_count is not None and max_count is not None: @@ -448,7 +446,7 @@ def filter_genes( cudata._inplace_subset_var(thr) else: - print(f"please check qc_var") + print("please check qc_var") def filter_cells( @@ -493,13 +491,13 @@ def filter_cells( elif max_count is not None: inter = np.where(cudata.obs[qc_var] < max_count)[0] else: - print(f"Please specify a cutoff to filter against") + print("Please specify a cutoff to filter against") if verbose: print(f"filtered out {cudata.obs.shape[0]-inter.shape[0]} cells") cudata._inplace_subset_obs(inter) elif qc_var in ["n_genes_by_counts", "total_counts"]: print( - f"Running `calculate_qc_metrics` for 'n_cells_by_counts' or 'total_counts'" + "Running `calculate_qc_metrics` for 'n_cells_by_counts' or 'total_counts'" ) calculate_qc_metrics(cudata, log1p=False) inter = np.array @@ -512,12 +510,12 @@ def filter_cells( elif max_count is not None: inter = np.where(cudata.obs[qc_var] < max_count)[0] else: - print(f"Please specify a cutoff to filter against") + print("Please specify a cutoff to filter against") if verbose: print(f"filtered out {cudata.obs.shape[0]-inter.shape[0]} cells") cudata._inplace_subset_obs(inter) else: - print(f"Please check qc_var.") + print("Please check qc_var.") def filter_highly_variable(cudata: cunnData) -> None: @@ -530,7 +528,7 @@ def filter_highly_variable(cudata: cunnData) -> None: """ if "highly_variable" in cudata.var.keys(): - thr = np.where(cudata.var["highly_variable"] == True)[0] + thr = np.where(cudata.var["highly_variable"] is True)[0] cudata._inplace_subset_var(thr) else: - print(f"Please calculate highly variable genes first") + print("Please calculate highly variable genes first") diff --git a/rapids_singlecell/cunnData_funcs/_utils.py b/rapids_singlecell/cunnData_funcs/_utils.py index 66acac3b..361d57eb 100644 --- a/rapids_singlecell/cunnData_funcs/_utils.py +++ b/rapids_singlecell/cunnData_funcs/_utils.py @@ -1,7 +1,8 @@ +import math + import cupy as cp import cupyx as cpx from cupyx.scipy.sparse import issparse -import math _get_mean_var_major = cp.RawKernel( r""" diff --git a/rapids_singlecell/decoupler_gpu/_method_mlm.py b/rapids_singlecell/decoupler_gpu/_method_mlm.py index 736706ed..265572c9 100644 --- a/rapids_singlecell/decoupler_gpu/_method_mlm.py +++ b/rapids_singlecell/decoupler_gpu/_method_mlm.py @@ -1,20 +1,18 @@ +from typing import Optional, Union + +import cupy as cp import numpy as np import pandas as pd -import cupy as cp - -from decoupler.pre import extract, match, rename_net, get_net_mat, filt_min_n - from anndata import AnnData +from decoupler.pre import extract, filt_min_n, get_net_mat, match, rename_net from scipy import stats - from tqdm import tqdm -from typing import Union, Optional def fit_mlm(X, y, inv, df): X = cp.ascontiguousarray(X) - n_samples = y.shape[1] - n_fsets = X.shape[1] + y.shape[1] + X.shape[1] coef, sse, _, _ = cp.linalg.lstsq(X, y, rcond=-1) if len(sse) == 0: raise ValueError( @@ -108,7 +106,6 @@ def run_mlm( **pvals** : DataFrame Obtained p-values. Stored in `.obsm['mlm_pvals']` if `mat` is AnnData. """ - # Extract sparse matrix and array of genes m, r, c = extract(mat, use_raw=use_raw, verbose=verbose) @@ -122,7 +119,7 @@ def run_mlm( if verbose: print( - "Running mlm on mat with {0} samples and {1} targets for {2} sources.".format( + "Running mlm on mat with {} samples and {} targets for {} sources.".format( m.shape[0], len(c), net.shape[1] ) ) diff --git a/rapids_singlecell/decoupler_gpu/_method_wsum.py b/rapids_singlecell/decoupler_gpu/_method_wsum.py index bd9601af..ca3b24f4 100644 --- a/rapids_singlecell/decoupler_gpu/_method_wsum.py +++ b/rapids_singlecell/decoupler_gpu/_method_wsum.py @@ -1,14 +1,12 @@ +from typing import Optional, Union + +import cupy as cp import numpy as np import pandas as pd -import cupy as cp - -from decoupler.pre import extract, match, rename_net, get_net_mat, filt_min_n - from anndata import AnnData +from decoupler.pre import extract, filt_min_n, get_net_mat, match, rename_net from tqdm import tqdm -from typing import Union, Optional - def run_perm(estimate, mat, net, idxs, times, seed): mat = cp.ascontiguousarray(mat) @@ -51,7 +49,7 @@ def wsum(mat, net, times, batch_size, seed, verbose): n_batches = int(np.ceil(n_samples / batch_size)) if verbose: - print("Infering activities on {0} batches.".format(n_batches)) + print(f"Infering activities on {n_batches} batches.") # Init empty acts estimate = np.zeros((n_samples, n_fsets), dtype=np.float32) @@ -139,7 +137,6 @@ def run_wsum( **pvals** : DataFrame Obtained p-values. Stored in `.obsm['wsum_pvals']` if `mat` is AnnData. """ - # Extract sparse matrix and array of genes m, r, c = extract(mat, use_raw=use_raw, verbose=verbose) # Transform net @@ -152,7 +149,7 @@ def run_wsum( if verbose: print( - "Running wsum on mat with {0} samples and {1} targets for {2} sources.".format( + "Running wsum on mat with {} samples and {} targets for {} sources.".format( m.shape[0], len(c), net.shape[1] ) ) diff --git a/rapids_singlecell/pl.py b/rapids_singlecell/pl.py index d7e7954a..946537b3 100644 --- a/rapids_singlecell/pl.py +++ b/rapids_singlecell/pl.py @@ -1,2 +1 @@ -from .cunnData_funcs._plotting import scatter -from .cunnData_funcs._plotting import violin +from .cunnData_funcs._plotting import * diff --git a/rapids_singlecell/scanpy_gpu/__init__.py b/rapids_singlecell/scanpy_gpu/__init__.py index 58916ba8..907c5862 100644 --- a/rapids_singlecell/scanpy_gpu/__init__.py +++ b/rapids_singlecell/scanpy_gpu/__init__.py @@ -1,9 +1,10 @@ -from ._clustering import leiden, louvain, kmeans +from rapids_singlecell.pp import pca + +from ._clustering import kmeans, leiden, louvain from ._diffmap import diffmap from ._draw_graph import draw_graph from ._embedding_density import embedding_density from ._harmony_integrate import harmony_integrate -from ..pp import pca from ._pymde import mde from ._rank_gene_groups import rank_genes_groups_logreg from ._tsne import tsne diff --git a/rapids_singlecell/scanpy_gpu/_clustering.py b/rapids_singlecell/scanpy_gpu/_clustering.py index 89f3e7ed..d6730bdd 100644 --- a/rapids_singlecell/scanpy_gpu/_clustering.py +++ b/rapids_singlecell/scanpy_gpu/_clustering.py @@ -1,19 +1,15 @@ -import cupy as cp +from typing import Optional + import cudf +import numpy as np +import pandas as pd +from anndata import AnnData from cugraph import Graph from cugraph import leiden as culeiden from cugraph import louvain as culouvain -import numpy as np -import pandas as pd from cuml.cluster import KMeans - -from anndata import AnnData - -from typing import Optional from natsort import natsorted -import warnings - def leiden( adata: AnnData, @@ -83,10 +79,10 @@ def leiden( ) # store information on the clustering parameters adata.uns["leiden"] = {} - adata.uns["leiden"]["params"] = dict( - resolution=resolution, - n_iterations=n_iterations, - ) + adata.uns["leiden"]["params"] = { + "resolution": resolution, + "n_iterations": n_iterations, + } def louvain( @@ -160,7 +156,7 @@ def louvain( categories=natsorted(map(str, np.unique(groups))), ) adata.uns["louvain"] = {} - adata.uns["louvain"]["params"] = dict(resolution=resolution) + adata.uns["louvain"]["params"] = {"resolution": resolution} def kmeans( @@ -184,7 +180,6 @@ def kmeans( state. """ - kmeans_out = KMeans(n_clusters=n_clusters, random_state=random_state).fit( adata.obsm["X_pca"] ) diff --git a/rapids_singlecell/scanpy_gpu/_diffmap.py b/rapids_singlecell/scanpy_gpu/_diffmap.py index 245e1228..2dc2d862 100644 --- a/rapids_singlecell/scanpy_gpu/_diffmap.py +++ b/rapids_singlecell/scanpy_gpu/_diffmap.py @@ -1,9 +1,9 @@ +import cupy as cp +import cupyx as cpx +import cupyx.scipy.sparse +import cupyx.scipy.sparse.linalg from anndata import AnnData from scipy.sparse import issparse -import cupyx.scipy.sparse.linalg -import cupyx.scipy.sparse -import cupyx as cpx -import cupy as cp def diffmap( @@ -37,7 +37,7 @@ def diffmap( Leave as is for the same behavior as sc.tl.diffmap Returns - ---------- + ------- updates `adata` with the following fields. `X_diffmap` : :class:`numpy.ndarray` (`adata.obsm`) @@ -47,7 +47,6 @@ def diffmap( Array of size (number of eigen vectors). Eigenvalues of transition matrix. """ - if neighbors_key: connectivities = adata.obsp[neighbors_key + "_connectivities"] else: diff --git a/rapids_singlecell/scanpy_gpu/_draw_graph.py b/rapids_singlecell/scanpy_gpu/_draw_graph.py index d384ec1e..9dc13659 100644 --- a/rapids_singlecell/scanpy_gpu/_draw_graph.py +++ b/rapids_singlecell/scanpy_gpu/_draw_graph.py @@ -1,9 +1,10 @@ -from anndata import AnnData +from typing import Union + import cudf import cugraph -import numpy as np import cupy as cp -from typing import Union +import numpy as np +from anndata import AnnData def draw_graph( @@ -31,13 +32,12 @@ def draw_graph( Above 1000 iterations is discouraged. Returns - ---------- + ------- updates `adata` with the following fields. X_draw_graph_layout_fa : `adata.obsm` Coordinates of graph layout. """ - from cugraph.layout import force_atlas2 # Adjacency graph @@ -104,6 +104,6 @@ def draw_graph( positions = cp.vstack((positions["x"].to_cupy(), positions["y"].to_cupy())).T layout = "fa" adata.uns["draw_graph"] = {} - adata.uns["draw_graph"]["params"] = dict(layout=layout, random_state=0) + adata.uns["draw_graph"]["params"] = {"layout": layout, "random_state": 0} key_added = f"X_draw_graph_{layout}" adata.obsm[key_added] = positions.get() # Format output diff --git a/rapids_singlecell/scanpy_gpu/_embedding_density.py b/rapids_singlecell/scanpy_gpu/_embedding_density.py index dbbc278c..23f344fd 100644 --- a/rapids_singlecell/scanpy_gpu/_embedding_density.py +++ b/rapids_singlecell/scanpy_gpu/_embedding_density.py @@ -1,6 +1,6 @@ -from anndata import AnnData import cupy as cp import numpy as np +from anndata import AnnData def embedding_density( @@ -41,6 +41,7 @@ def embedding_density( components The embedding dimensions over which the density should be calculated. This is limited to two components. + Returns ------- Updates `adata.obs` with an additional field specified by the `key_added` \ @@ -122,9 +123,10 @@ def embedding_density( if basis != "diffmap": components += 1 - adata.uns[f"{density_covariate}_params"] = dict( - covariate=groupby, components=components.tolist() - ) + adata.uns[f"{density_covariate}_params"] = { + "covariate": groupby, + "components": components.tolist(), + } def _calc_density(x: cp.ndarray, y: cp.ndarray): diff --git a/rapids_singlecell/scanpy_gpu/_harmonpy_gpu.py b/rapids_singlecell/scanpy_gpu/_harmonpy_gpu.py index 7b9cd7c6..4206bb23 100644 --- a/rapids_singlecell/scanpy_gpu/_harmonpy_gpu.py +++ b/rapids_singlecell/scanpy_gpu/_harmonpy_gpu.py @@ -16,12 +16,12 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import pandas as pd -import numpy as np -import cupy as cp +import logging +import cupy as cp +import numpy as np +import pandas as pd from cuml import KMeans -import logging # create logger logger = logging.getLogger("harmonypy_gpu") @@ -56,7 +56,6 @@ def run_harmony( random_state=0, ): """Run Harmony.""" - # theta = None # lamb = None # sigma = 0.1 @@ -143,7 +142,7 @@ def run_harmony( return ho -class Harmony(object): +class Harmony: def __init__( self, Z, @@ -249,7 +248,7 @@ def harmonize(self, iter_harmony=10, verbose=True): converged = False for i in range(1, iter_harmony + 1): if verbose: - logger.info("Iteration {} of {}".format(i, iter_harmony)) + logger.info(f"Iteration {i} of {iter_harmony}") # STEP 1: Clustering self.cluster() # STEP 2: Regress out covariates diff --git a/rapids_singlecell/scanpy_gpu/_pymde.py b/rapids_singlecell/scanpy_gpu/_pymde.py index 4757ecdd..ae4b8799 100644 --- a/rapids_singlecell/scanpy_gpu/_pymde.py +++ b/rapids_singlecell/scanpy_gpu/_pymde.py @@ -1,6 +1,7 @@ -from anndata import AnnData -from typing import Optional, Literal +from typing import Literal, Optional + import pandas as pd +from anndata import AnnData def mde( @@ -46,12 +47,12 @@ def mde( Agrawal, Akshay, Alnur Ali, and Stephen Boyd. "Minimum-distortion embedding." arXiv preprint arXiv:2103.02559 (2021). """ try: - import torch import pymde + import torch except ImportError: raise ImportError("Please install pymde package via `pip install pymde`") - if use_rep == None: + if use_rep is None: data = adata.obsm["X_pca"] else: data = adata.obsm[use_rep] @@ -62,14 +63,14 @@ def mde( data = data[:, :n_pcs] device = "cpu" if not torch.cuda.is_available() else "cuda" - _kwargs = dict( - embedding_dim=2, - constraint=pymde.Standardized(), - repulsive_fraction=0.7, - verbose=False, - device=device, - n_neighbors=n_neighbors, - ) + _kwargs = { + "embedding_dim": 2, + "constraint": pymde.Standardized(), + "repulsive_fraction": 0.7, + "verbose": False, + "device": device, + "n_neighbors": n_neighbors, + } _kwargs.update(kwargs) emb = pymde.preserve_neighbors(data, **_kwargs).embed(verbose=_kwargs["verbose"]) diff --git a/rapids_singlecell/scanpy_gpu/_rank_gene_groups.py b/rapids_singlecell/scanpy_gpu/_rank_gene_groups.py index 625d57f3..1278c99e 100644 --- a/rapids_singlecell/scanpy_gpu/_rank_gene_groups.py +++ b/rapids_singlecell/scanpy_gpu/_rank_gene_groups.py @@ -1,9 +1,9 @@ +from typing import Iterable, Literal, Union + import cupy as cp -from anndata import AnnData import numpy as np import pandas as pd -import warnings -from typing import Iterable, Union, Optional, Literal +from anndata import AnnData def _select_groups(labels, groups_order_subset="all"): @@ -75,7 +75,7 @@ def rank_genes_groups_logreg( Key from `adata.layers` whose value will be used to perform tests on. Returns - -------- + ------- Updates `adata` with the following fields. **names** : structured `np.ndarray` (`.uns['rank_genes_groups']`) @@ -93,11 +93,10 @@ def rank_genes_groups_logreg( **pvals_adj** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Corrected p-values. """ - #### Wherever we see "adata.obs[groupby], we should just replace w/ the groups" # for clarity, rename variable - if groups == "all" or groups == None: + if groups == "all" or groups is None: groups_order = "all" elif isinstance(groups, (str, int)): raise ValueError("Specify a sequence of groups") @@ -116,16 +115,16 @@ def rank_genes_groups_logreg( groups_order, groups_masks = _select_groups(labels, groups_order) - if layer and use_raw == True: + if layer and use_raw is True: raise ValueError("Cannot specify `layer` and have `use_raw=True`.") elif layer: X = adata.layers[layer] var_names = adata.var_names - elif use_raw == None and adata.raw: + elif use_raw is None and adata.raw: print("defaulting to using `.raw`") X = adata.raw.X var_names = adata.raw.var_names - elif use_raw == True: + elif use_raw is True: X = adata.raw.X var_names = adata.raw.var_names else: @@ -135,7 +134,7 @@ def rank_genes_groups_logreg( # for clarity, rename variable n_genes_user = n_genes # make sure indices are not OoB in case there are less genes than n_genes - if n_genes == None or n_genes_user > X.shape[1]: + if n_genes is None or n_genes_user > X.shape[1]: n_genes_user = X.shape[1] # in the following, n_genes is simply another name for the total number of genes @@ -178,7 +177,7 @@ def rank_genes_groups_logreg( if len(groups_order) == scores_all.shape[1]: scores_all = scores_all.T - for igroup, group in enumerate(groups_order): + for igroup, _group in enumerate(groups_order): if len(groups_order) <= 2: # binary logistic regression scores = scores_all[0] else: @@ -197,17 +196,20 @@ def rank_genes_groups_logreg( groups_order_save = [groups_order_save[0]] scores = np.rec.fromarrays( - [n for n in rankings_gene_scores], + list(rankings_gene_scores), dtype=[(rn, "float32") for rn in groups_order_save], ) names = np.rec.fromarrays( - [n for n in rankings_gene_names], + list(rankings_gene_names), dtype=[(rn, "U50") for rn in groups_order_save], ) adata.uns["rank_genes_groups"] = {} - adata.uns["rank_genes_groups"]["params"] = dict( - groupby=groupby, method="logreg", reference=refname, use_raw=use_raw - ) + adata.uns["rank_genes_groups"]["params"] = { + "groupby": groupby, + "method": "logreg", + "reference": refname, + "use_raw": use_raw, + } adata.uns["rank_genes_groups"]["scores"] = scores adata.uns["rank_genes_groups"]["names"] = names diff --git a/rapids_singlecell/scanpy_gpu/_tsne.py b/rapids_singlecell/scanpy_gpu/_tsne.py index 6479c6e9..24107506 100644 --- a/rapids_singlecell/scanpy_gpu/_tsne.py +++ b/rapids_singlecell/scanpy_gpu/_tsne.py @@ -1,6 +1,5 @@ from anndata import AnnData from cuml.manifold import TSNE -from typing import Optional def tsne( @@ -17,7 +16,7 @@ def tsne( Performs t-distributed stochastic neighborhood embedding (tSNE) using cuML libraray. Parameters - --------- + ---------- adata Annotated data matrix. n_pcs @@ -54,7 +53,7 @@ def tsne( **X_tsne** : `np.ndarray` (`adata.obs`, dtype `float`) tSNE coordinates of data. """ - if use_rep == None: + if use_rep is None: data = adata.obsm["X_pca"] else: data = adata.obsm[use_rep] diff --git a/rapids_singlecell/squidpy_gpu/_autocorr.py b/rapids_singlecell/squidpy_gpu/_autocorr.py index dd30863e..3e6c9281 100644 --- a/rapids_singlecell/squidpy_gpu/_autocorr.py +++ b/rapids_singlecell/squidpy_gpu/_autocorr.py @@ -1,20 +1,21 @@ -from anndata import AnnData -import pandas as pd -import numpy as np from typing import ( - TYPE_CHECKING, Literal, # < 3.8 + Optional, Sequence, Union, - Optional, ) -from scipy import sparse + import cupy as cp import cupyx as cpx -from ._moransi import _morans_I_cupy +import numpy as np +import pandas as pd +from anndata import AnnData +from scipy import sparse +from statsmodels.stats.multitest import multipletests + from ._gearysc import _gearys_C_cupy +from ._moransi import _morans_I_cupy from ._utils import _p_value_calc -from statsmodels.stats.multitest import multipletests def spatial_autocorr( @@ -76,7 +77,6 @@ def spatial_autocorr( DataFrame containing the autocorrelation scores, p-values, and corrected p-values for each gene. \ If `copy` is False, the results are stored in `adata.uns` and None is returned. """ - if genes is None: if "highly_variable" in adata.var: genes = adata[:, adata.var["highly_variable"]].var_names.values diff --git a/rapids_singlecell/squidpy_gpu/_gearysc.py b/rapids_singlecell/squidpy_gpu/_gearysc.py index de7db53b..a300ccc7 100644 --- a/rapids_singlecell/squidpy_gpu/_gearysc.py +++ b/rapids_singlecell/squidpy_gpu/_gearysc.py @@ -1,7 +1,7 @@ -import cupy as cp -import cupyx as cpx import math +import cupy as cp +import cupyx as cpx kernel_gearys_C_num_dense = r""" extern "C" __global__ void gearys_C_num_dense(const float* data, diff --git a/rapids_singlecell/squidpy_gpu/_ligrec.py b/rapids_singlecell/squidpy_gpu/_ligrec.py index 654e0648..957798ff 100644 --- a/rapids_singlecell/squidpy_gpu/_ligrec.py +++ b/rapids_singlecell/squidpy_gpu/_ligrec.py @@ -1,22 +1,23 @@ -import cupy as cp -import numpy as np -import pandas as pd -from scipy.sparse import csc_matrix, issparse -from cupyx.scipy.sparse import issparse as cpissparse -import cupyx as cpx import math -from anndata import AnnData from itertools import product from typing import ( - Union, + Iterable, Literal, + Mapping, Optional, Sequence, - Mapping, - Iterable, + Union, ) -from ._utils import _create_sparse_df, _assert_categorical_obs +import cupy as cp +import cupyx as cpx +import numpy as np +import pandas as pd +from anndata import AnnData +from cupyx.scipy.sparse import issparse as cpissparse +from scipy.sparse import csc_matrix, issparse + +from ._utils import _assert_categorical_obs, _create_sparse_df SOURCE = "source" TARGET = "target" @@ -696,7 +697,7 @@ def find_min_gene_in_complex(_complex: str | None) -> str | None: (len(interactions_), len(interaction_clusters)), dtype=np.float32, order="C" ) if cpissparse(data_cp): - for i in range(n_perms): + for _i in range(n_perms): cp.random.shuffle(clustering_use) g = cp.zeros((data_cp.shape[1], n_cls), dtype=cp.float32, order="C") mean_kernel_sparse( @@ -734,7 +735,7 @@ def find_min_gene_in_complex(_complex: str | None) -> str | None: ), ) else: - for i in range(n_perms): + for _i in range(n_perms): cp.random.shuffle(clustering_use) g = cp.zeros((data_cp.shape[1], n_cls), dtype=cp.float32, order="C") mean_kernel( diff --git a/rapids_singlecell/squidpy_gpu/_moransi.py b/rapids_singlecell/squidpy_gpu/_moransi.py index 05c84c30..480ccad9 100644 --- a/rapids_singlecell/squidpy_gpu/_moransi.py +++ b/rapids_singlecell/squidpy_gpu/_moransi.py @@ -1,6 +1,7 @@ +import math + import cupy as cp import cupyx as cpx -import math kernel_morans_I_num_dense = r""" extern "C" __global__ diff --git a/rapids_singlecell/squidpy_gpu/_utils.py b/rapids_singlecell/squidpy_gpu/_utils.py index 8a9d01cf..be78a8cf 100644 --- a/rapids_singlecell/squidpy_gpu/_utils.py +++ b/rapids_singlecell/squidpy_gpu/_utils.py @@ -1,14 +1,15 @@ -import numpy as np from typing import ( TYPE_CHECKING, Any, Dict, Union, ) -from scipy import stats -from scipy.sparse import spmatrix, issparse + +import numpy as np import pandas as pd from pandas.api.types import infer_dtype, is_categorical_dtype +from scipy import stats +from scipy.sparse import issparse, spmatrix ### Taken from squidpy: https://github.com/scverse/squidpy/blob/main/squidpy/gr/_ppatterns.py @@ -20,6 +21,7 @@ def _p_value_calc( ): """ Handle p-value calculation for spatial autocorrelation function. + Parameters ---------- score @@ -28,6 +30,7 @@ def _p_value_calc( (n_simulations, n_features). params Object to store relevant function parameters. + Returns ------- pval_norm diff --git a/rapids_singlecell/tests/test_autocorr.py b/rapids_singlecell/tests/test_autocorr.py index 4c9ff4bf..1787ea7b 100644 --- a/rapids_singlecell/tests/test_autocorr.py +++ b/rapids_singlecell/tests/test_autocorr.py @@ -1,9 +1,9 @@ -import pytest -from rapids_singlecell.gr import spatial_autocorr -import pandas as pd -from anndata import read_h5ad from pathlib import Path + import numpy as np +import pytest +from anndata import read_h5ad +from rapids_singlecell.gr import spatial_autocorr MORAN_I = "moranI" GEARY_C = "gearyC" diff --git a/rapids_singlecell/tests/test_cunndata.py b/rapids_singlecell/tests/test_cunndata.py index e7196f47..2944d319 100644 --- a/rapids_singlecell/tests/test_cunndata.py +++ b/rapids_singlecell/tests/test_cunndata.py @@ -1,18 +1,16 @@ -import scanpy as sc -import rapids_singlecell as rsc -from rapids_singlecell.cunnData import cunnData -from anndata import AnnData -import numpy as np import cupy as cp +import cupyx.scipy.sparse as spg +import numpy as np import pandas as pd import pytest +from anndata import AnnData +from rapids_singlecell.cunnData import cunnData from scipy import sparse as sp -import cupyx.scipy.sparse as spg def test_create_with_dfs(): X = cp.ones((6, 3)) - obs = pd.DataFrame(dict(cat_anno=pd.Categorical(["a", "a", "a", "a", "b", "a"]))) + obs = pd.DataFrame({"cat_anno": pd.Categorical(["a", "a", "a", "a", "b", "a"])}) obs_copy = obs.copy() adata = cunnData(X=X, obs=obs) assert obs.index.equals(obs_copy.index) @@ -23,11 +21,11 @@ def test_creation(): cunnData(X=np.array([[1, 2], [3, 4]])) cunnData(X=np.array([[1, 2], [3, 4]]), obs={}, var={}) X = np.array([[1, 2, 3], [4, 5, 6]]) - adata = cunnData( + cunnData( X=X, - obs=dict(Obs=["A", "B"]), - var=dict(Feat=["a", "b", "c"]), - obsm=dict(X_pca=np.array([[1, 2], [3, 4]])), + obs={"Obs": ["A", "B"]}, + var={"Feat": ["a", "b", "c"]}, + obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) diff --git a/rapids_singlecell/tests/test_decoupler.py b/rapids_singlecell/tests/test_decoupler.py index 62d22bb3..718487c7 100644 --- a/rapids_singlecell/tests/test_decoupler.py +++ b/rapids_singlecell/tests/test_decoupler.py @@ -1,10 +1,10 @@ import numpy as np import pandas as pd -from scipy.sparse import csr_matrix from anndata import AnnData from rapids_singlecell.dcg import run_mlm, run_wsum from rapids_singlecell.decoupler_gpu._method_mlm import mlm from rapids_singlecell.decoupler_gpu._method_wsum import wsum +from scipy.sparse import csr_matrix def test_mlm(): @@ -20,7 +20,7 @@ def test_mlm(): ) ) net = np.array([[1.0, 0.0], [2, 0.0], [0.0, -3.0], [0.0, 4.0]], dtype=np.float32) - res = mlm(m, net) + mlm(m, net) def test_run_mlm(): @@ -56,7 +56,7 @@ def test_wsum(): ) ) net = np.array([[1.0, 0.0], [2, 0.0], [0.0, -3.0], [0.0, 4.0]], dtype=np.float32) - res = wsum(m, net, 2, 10000, 42, True) + wsum(m, net, 2, 10000, 42, True) def test_run_wsum(): diff --git a/rapids_singlecell/tests/test_emdedding_density.py b/rapids_singlecell/tests/test_emdedding_density.py index 357bcf7e..7d3a95b7 100644 --- a/rapids_singlecell/tests/test_emdedding_density.py +++ b/rapids_singlecell/tests/test_emdedding_density.py @@ -1,6 +1,6 @@ import numpy as np -from anndata import AnnData import rapids_singlecell as rsc +from anndata import AnnData def test_embedding_density(): diff --git a/rapids_singlecell/tests/test_hvg.py b/rapids_singlecell/tests/test_hvg.py index 46e6fafb..ddc92618 100644 --- a/rapids_singlecell/tests/test_hvg.py +++ b/rapids_singlecell/tests/test_hvg.py @@ -1,13 +1,13 @@ -import scanpy as sc -import numpy as np +from pathlib import Path + import cupy as cp import cupyx as cpx -import rapids_singlecell as rsc -from rapids_singlecell.cunnData import cunnData +import numpy as np import pandas as pd import pytest - -from pathlib import Path +import rapids_singlecell as rsc +import scanpy as sc +from rapids_singlecell.cunnData import cunnData FILE = Path(__file__).parent / Path("_scripts/seurat_hvg.csv") FILE_V3 = Path(__file__).parent / Path("_scripts/seurat_hvg_v3.csv.gz") diff --git a/rapids_singlecell/tests/test_ligrec.py b/rapids_singlecell/tests/test_ligrec.py index 9a5add47..f7e18d87 100644 --- a/rapids_singlecell/tests/test_ligrec.py +++ b/rapids_singlecell/tests/test_ligrec.py @@ -1,18 +1,14 @@ -import sys +import pickle from itertools import product -from typing import TYPE_CHECKING, Mapping, Optional, Sequence, Tuple +from pathlib import Path +from typing import TYPE_CHECKING, Optional, Sequence, Tuple import numpy as np import pandas as pd import pytest import scanpy as sc from anndata import AnnData, read_h5ad -from pandas.testing import assert_frame_equal -from itertools import product - from rapids_singlecell.gr import ligrec -from pathlib import Path -import pickle _CK = "leiden" Interactions_t = Tuple[Sequence[str], Sequence[str]] @@ -90,13 +86,13 @@ def test_only_1_cluster(self, adata: AnnData, interactions: Interactions_t): adata.obs["foo"] = 1 adata.obs["foo"] = adata.obs["foo"].astype("category") with pytest.raises( - ValueError, match=rf"Expected at least `2` clusters, found `1`." + ValueError, match=r"Expected at least `2` clusters, found `1`." ): ligrec(adata, "foo", interactions=interactions) def test_invalid_complex_policy(self, adata: AnnData, interactions: Interactions_t): with pytest.raises( - ValueError, match=rf"Invalid option `'foobar'` for `ComplexPolicy`." + ValueError, match=r"Invalid option `'foobar'` for `ComplexPolicy`." ): ligrec(adata, _CK, interactions=interactions, complex_policy="foobar") diff --git a/rapids_singlecell/tests/test_normalization.py b/rapids_singlecell/tests/test_normalization.py index a54d71f5..b612708a 100644 --- a/rapids_singlecell/tests/test_normalization.py +++ b/rapids_singlecell/tests/test_normalization.py @@ -1,9 +1,9 @@ -import numpy as np import cupy as cp -from anndata import AnnData +import numpy as np +import pytest import rapids_singlecell as rsc +from anndata import AnnData from rapids_singlecell.cunnData import cunnData -import pytest from scipy.sparse import csr_matrix X_total = np.array([[1, 0], [3, 0], [5, 6]]) diff --git a/rapids_singlecell/tests/test_pca.py b/rapids_singlecell/tests/test_pca.py index 7554f774..6f505ec5 100644 --- a/rapids_singlecell/tests/test_pca.py +++ b/rapids_singlecell/tests/test_pca.py @@ -1,8 +1,9 @@ -import rapids_singlecell as rsc import numpy as np import pytest +import rapids_singlecell as rsc from anndata import AnnData from scanpy.datasets import pbmc3k_processed +from scipy import sparse A_list = [ [0, 0, 7, 0, 0], @@ -104,3 +105,30 @@ def test_pca_reproducible(): rsc.tl.pca(cpbmc) c = pbmc.obsm["X_pca"] np.array_equal(a, c) + + +def test_pca_sparse(): + sparse_ad = pbmc3k_processed() + default = pbmc3k_processed() + sparse_ad.X = sparse.csr_matrix(sparse_ad.X.astype(np.float64)) + default.X = default.X.astype(np.float64) + rsc.pp.pca(sparse_ad) + rsc.pp.pca(default) + + np.testing.assert_allclose( + np.abs(sparse_ad.obsm["X_pca"]), + np.abs(default.obsm["X_pca"]), + rtol=1e-7, + atol=1e-6, + ) + + np.testing.assert_allclose( + np.abs(sparse_ad.varm["PCs"]), np.abs(default.varm["PCs"]), rtol=1e-7, atol=1e-6 + ) + + np.testing.assert_allclose( + np.abs(sparse_ad.uns["pca"]["variance_ratio"]), + np.abs(default.uns["pca"]["variance_ratio"]), + rtol=1e-7, + atol=1e-6, + ) diff --git a/rapids_singlecell/tests/test_preprocessing.py b/rapids_singlecell/tests/test_preprocessing.py index 7ec68b29..4cb01468 100644 --- a/rapids_singlecell/tests/test_preprocessing.py +++ b/rapids_singlecell/tests/test_preprocessing.py @@ -1,6 +1,6 @@ -import scanpy as sc -import rapids_singlecell as rsc import cupy as cp +import rapids_singlecell as rsc +import scanpy as sc def test_scale(): diff --git a/rapids_singlecell/tests/test_qc_metrics.py b/rapids_singlecell/tests/test_qc_metrics.py index 1e882014..b5115802 100644 --- a/rapids_singlecell/tests/test_qc_metrics.py +++ b/rapids_singlecell/tests/test_qc_metrics.py @@ -1,12 +1,11 @@ -import scanpy as sc -import rapids_singlecell as rsc -from scipy import sparse -import numpy as np import cupy as cp +import numpy as np +import pandas as pd +import pytest +import rapids_singlecell as rsc from anndata import AnnData from cupyx.scipy import sparse as sparse_gpu -import pytest -import pandas as pd +from scipy import sparse def test_qc_metrics(): @@ -67,7 +66,7 @@ def test_qc_metrics(): def adata_mito(): a = np.random.binomial(100, 0.005, (1000, 1000)) init_var = pd.DataFrame( - dict(mito=np.concatenate((np.ones(100, dtype=bool), np.zeros(900, dtype=bool)))) + {"mito": np.concatenate((np.ones(100, dtype=bool), np.zeros(900, dtype=bool)))} ) adata_dense = AnnData(X=a, var=init_var.copy()) return adata_dense diff --git a/rapids_singlecell/tests/test_rank_genes_groups_logreg.py b/rapids_singlecell/tests/test_rank_genes_groups_logreg.py index 6ddca046..2480c0cc 100644 --- a/rapids_singlecell/tests/test_rank_genes_groups_logreg.py +++ b/rapids_singlecell/tests/test_rank_genes_groups_logreg.py @@ -1,6 +1,6 @@ import numpy as np -import scanpy as sc import rapids_singlecell as rsc +import scanpy as sc def test_rank_genes_groups_with_renamed_categories():