Merge pull request #8 from Intron7/update-v0.3.2

Update v0.3.2
scverse · Jan 11, 2023 · 24b7494 · 24b7494
2 parents 087995a + 2c08d86
commit 24b7494
Show file tree

Hide file tree

Showing 13 changed files with 1,537 additions and 699 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ ipython kernel install --user --name=rapids_singelcell
 ```
 After you set up the environment, you can install this package into it from the wheel below. The wheel doesn't install any dependencies.
 ```
-pip install https://github.com/Intron7/rapids_singlecell/releases/download/v0.3.1/rapids_singlecell-0.3.1-py3-none-any.whl
+pip install https://github.com/Intron7/rapids_singlecell/releases/download/v0.3.2/rapids_singlecell-0.3.2-py3-none-any.whl
 ```
 
 With this environment, you should be able to run the notebooks. So far I have tested these notebooks on an A100 80GB, a Quadro RTX 6000 and an RTX 3090.
@@ -37,7 +37,7 @@ If you use the accelerated decoupler functions please cite [decoupler](https://d
 As of version `0.3.0`, `rapids_singlecell` has been updated to use functions instead of methods of the class. To see the new recommended usage, please check the notebooks.
 
 ### cunnData
-The preprocessing of the single-cell data is performed with `cunnData`. It is a replacement for the [AnnData](https://github.com/scverse/anndata) object used by scanpy. The `cunnData` object is a cutdown version of an `AnnData` object. At its core lies a sparse matrix (`.X`) within the GPU memory. `.obs` and `.var` are pandas data frame and `.uns` is a dictionary. It also supports `.layers` and `.obsm`. `.layers` are stored on the GPU, while `.obsm` is not.
+The preprocessing of the single-cell data is performed with `cunnData`. It is a replacement for the [AnnData](https://github.com/scverse/anndata) object used by scanpy. The `cunnData` object is a cut-down version of an `AnnData` object. At its core lies a sparse matrix (`.X`) within the GPU memory. `.obs` and `.var` are pandas data frames and `.uns` is a dictionary. It also supports `.layers`, `.varm` and `.obsm`. `.layers` are stored on the GPU, while `.obsm` and `.varm` are not.
 Since version `0.3.0` you can use cunnData for spatial transcriptomics datasets.\
 `cunnData` includes methods for:
 * `__getitem__` to filter the object based on `.obs` and `.var`.
@@ -61,13 +61,14 @@ Please have look at the notebooks to assess the functionality. I tried to write
   * poisson_gene_selection (adapted from `scvi`)
 * regress_out
 * scale
+* PCA (PCA/ incremental PCA/ truncated svd)
 * some plotting functions of qc parameters
 
 
-### scanpy_gpu
+### scanpy_gpu or tl
 `scanpy_gpu` are functions that are written to directly work with an `AnnData` object and replace the scanpy counterpart by running on the GPU. Scanpy already supports GPU versions of `pp.neighbors` and `tl.umap` using rapids.\
 `scanpy_gpu` includes additional functions for:
-* PCA
+* PCA (PCA/ incremental PCA/ truncated svd)
 * Leiden Clustering
 * Louvain Clustering
 * TSNE

diff --git a/notebooks/data_downloader.ipynb b/notebooks/data_downloader.ipynb
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 10,
    "id": "c544c54c-4134-4aed-b1ff-87b2ebad540b",
    "metadata": {},
    "outputs": [],
@@ -58,26 +58,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "d3472f4a-9f39-4636-bb90-bb5726ebacf8",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n"
-     ]
-    },
-    {
-     "ename": "TypeError",
-     "evalue": "write() missing 1 required positional argument: 'adata'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Input \u001b[0;32mIn [3]\u001b[0m, in \u001b[0;36m<cell line: 5>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m adata \u001b[38;5;241m=\u001b[39m sc\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mh5/nvidia_1.3M.h5ad\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      4\u001b[0m adata \u001b[38;5;241m=\u001b[39m adata[:\u001b[38;5;241m200000\u001b[39m,:]\n\u001b[0;32m----> 5\u001b[0m \u001b[43msc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mh5/200000.h5ad\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
-      "\u001b[0;31mTypeError\u001b[0m: write() missing 1 required positional argument: 'adata'"
+      "/home/sedi10/conda/envs/rapids-22.12/lib/python3.9/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n",
+      "  utils.warn_names_duplicates(\"var\")\n"
      ]
     }
    ],
@@ -92,37 +82,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "id": "7f7a29c2-6955-4b2f-b62a-f49c0330eb87",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n"
+      "/home/sedi10/conda/envs/rapids-22.12/lib/python3.9/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n",
+      "  utils.warn_names_duplicates(\"var\")\n"
      ]
     }
    ],
    "source": [
     "adata = sc.read(\"h5/nvidia_1.3M.h5ad\")\n",
+    "adata.var_names_make_unique()\n",
     "adata = adata[:500000,:]\n",
     "adata.write(\"h5/500000.h5ad\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6270f9da-f596-4c31-b1f2-770372fcfbf7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "adata = sc.read(\"h5/nvidia_1.3M.h5ad\")\n",
-    "adata = adata[:200000,:]\n",
-    "adata.var_names_make_unique()\n",
-    "adata.write(\"h5/200000.h5ad\")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -137,9 +116,9 @@
    "hash": "b70383e23f09abfebb324225aa0a73f4c8adec9f01b8b9679b19e76c26fcd2e4"
   },
   "kernelspec": {
-   "display_name": "rapids-22.04",
+   "display_name": "rapids-22.12",
    "language": "python",
-   "name": "rapids-22.04"
+   "name": "rapids-22.12"
   },
   "language_info": {
    "codemirror_mode": {
@@ -151,7 +130,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.9.15"
   }
  },
  "nbformat": 4,

diff --git a/notebooks/demo_cpu-PR.ipynb b/notebooks/demo_cpu-PR.ipynb
diff --git a/notebooks/demo_cpu-seuratv3.ipynb b/notebooks/demo_cpu-seuratv3.ipynb
diff --git a/notebooks/demo_gpu-PR.ipynb b/notebooks/demo_gpu-PR.ipynb
diff --git a/notebooks/demo_gpu-seuratv3.ipynb b/notebooks/demo_gpu-seuratv3.ipynb
diff --git a/rapids_singlecell/__init__.py b/rapids_singlecell/__init__.py
@@ -3,4 +3,4 @@
 from . import decoupler_gpu as dcg
 from . import scanpy_gpu as tl
 
-__version__ = '0.3.1'
+__version__ = '0.3.2'
diff --git a/rapids_singlecell/cunnData/__init__.py b/rapids_singlecell/cunnData/__init__.py
@@ -13,8 +13,10 @@
 import math
 from scipy import sparse
 from typing import Any, Union, Optional, Mapping
+from pandas.api.types import infer_dtype
 
 import warnings
+from natsort import natsorted
 
 from scipy.sparse import issparse as issparse_cpu
 from cupyx.scipy.sparse import issparse as issparse_gpu
@@ -41,7 +43,7 @@ def __setitem__(self, key, item):
 
 class obsm_Mapping(dict):
     """
-    Dictonary subclass for layers handeling in cunnData
+    Dictonary subclass for obsm handeling in cunnData
     """
     def __init__(self, shape):
         super().__init__({})
@@ -54,7 +56,27 @@ def __setitem__(self, key, item):
         if self.shape == item.shape[0]:
             super().__setitem__(key, item)
         else:
-            raise ValueError(f"Shape of {key} does not match `.X`")
+            raise ValueError(f"Shape of {key} does not match `.n_obs`")
+
+class varm_Mapping(dict):
+    """
+    Dictionary subclass for varm handling in cunnData.
+
+    Keeps the expected first-axis length in `self.shape` and rejects any
+    item whose `shape[0]` differs from it.
+    """
+    def __init__(self, shape):
+        # `shape` is the expected first-axis length of every stored item
+        super().__init__({})
+        self.shape = shape
+
+    def update_shape(self,shape):
+        # Replace the expected first-axis length; existing entries are not re-checked here
+        self.shape = shape
+
+    def __setitem__(self, key, item):
+        # Only accept items whose first axis matches the expected length
+        if self.shape == item.shape[0]:
+            super().__setitem__(key, item)
+        else:
+            raise ValueError(f"Shape of {key} does not match `.n_vars`")
+
+
+
 
 class cunnData:
     """
@@ -64,13 +86,14 @@ class cunnData:
     uns = {}
     def __init__(
         self,
+        adata: Optional[anndata.AnnData] = None,
         X: Optional[Union[np.ndarray,sparse.spmatrix, cp.array, cp.sparse.csr_matrix]] = None,
         obs: Optional[pd.DataFrame] = None,
         var: Optional[pd.DataFrame] = None,
         uns: Optional[Mapping[str, Any]] = None,
         layers: Optional[Mapping[str, Any]] = None,
         obsm: Optional[Mapping[str, Any]] = None,
-        adata: Optional[anndata.AnnData] = None):
+        varm: Optional[Mapping[str, Any]] = None):
             if adata:
                 if not issparse_cpu(adata.X):
                     inter = scipy.sparse.csr_matrix(adata.X)
@@ -83,6 +106,7 @@ def __init__(
                 self.uns = adata.uns.copy()
                 self.layers = Layer_Mapping(self.shape)
                 self.obsm = obsm_Mapping(self.shape[0])
+                self.varm = varm_Mapping(self.shape[1])
                 if adata.layers:
                     for key, matrix in adata.layers.items():
                         if not issparse_cpu(matrix):
@@ -96,6 +120,9 @@ def __init__(
                 if adata.obsm:
                     for key, matrix in adata.obsm.items():
                         self.obsm[key] = matrix
+                if adata.varm:
+                    for key, matrix in adata.varm.items():
+                        self.varm[key] = matrix
 
             else:
                 if issparse_gpu(X):
@@ -112,6 +139,9 @@ def __init__(
                 self.uns = uns
                 self.layers = Layer_Mapping(self.shape)
                 self.obsm = obsm_Mapping(self.shape[0])
+                self.varm = varm_Mapping(self.shape[1])
+                self.raw = None
+
                 if layers:
                     for key, matrix in layers.items():
                         if issparse_gpu(matrix):
@@ -126,6 +156,9 @@ def __init__(
                 if obsm:
                     for key, matrix in obsm.items():
                         self.obsm[key] = matrix.copy()
+                # This branch runs without an AnnData object, so read the
+                # `varm` argument itself instead of `adata.varm`.
+                if varm:
+                    for key, matrix in varm.items():
+                        self.varm[key] = matrix
 
     @property
     def shape(self):
@@ -149,11 +182,29 @@ def n_obs(self):
     @property
     def n_vars(self):
         return self.shape[1]
+
+    def _update_shape(self):
+        """
+        Pass the current shape of the object on to the `.layers`, `.obsm`
+        and `.varm` mappings so that their shape checks use it.
+        """
+        # layers are checked against the full shape, obsm against the first
+        # dimension and varm against the second dimension
+        self.layers.update_shape(self.shape)
+        self.obsm.update_shape(self.shape[0])
+        self.varm.update_shape(self.shape[1])
+
+    def _sanitize(self):
+        """
+        Convert string columns of `.obs` and `.var` to categoricals.
+
+        Columns with at least as many categories as entries are left
+        unchanged. The categories of converted columns are ordered with
+        `natsorted`.
+        """
+        dfs = [self.obs, self.var]
+        for df in dfs:
+            # Only columns that pandas infers as "string" are considered
+            string_cols = [
+                key for key in df.columns if infer_dtype(df[key]) == "string"
+            ]
+            for key in string_cols:
+                c = pd.Categorical(df[key])
+                # Skip columns where the number of categories is not smaller
+                # than the number of entries
+                if len(c.categories) >= len(c):
+                    continue
+                # Ideally this could be done inplace
+                sorted_categories = natsorted(c.categories)
+                # Reorder only when the categories are not already in natsorted order
+                if not np.array_equal(c.categories, sorted_categories):
+                    c = c.reorder_categories(sorted_categories)
+                df[key] = c
 
     def __getitem__(self, index):
-        """
-        Currently only works for `obs`
-        """
         if type(index) == tuple:
             obs_dx, var_dx = index
         else:
@@ -167,8 +218,7 @@ def __getitem__(self, index):
             var_dx = var_dx.values
 
         self.X = self.X[obs_dx,var_dx]
-        self.layers.update_shape(self.shape)
-        self.obsm.update_shape(self.shape[0])
+        self._update_shape()
         if self.layers:
             for key, matrix in self.layers.items():
                 self.layers[key] = matrix[obs_dx, var_dx]
@@ -178,7 +228,37 @@ def __getitem__(self, index):
                     self.obsm[key] = matrix.iloc[obs_dx, :]
                 else:
                     self.obsm[key] = matrix[obs_dx, :]
-        return(cunnData(X = self.X,obs = self.obs.iloc[obs_dx,:],var = self.var.iloc[var_dx,:],uns=self.uns,layers= self.layers,obsm= self.obsm))
+        if self.varm:
+            for key, matrix in self.varm.items():
+                if isinstance(matrix, pd.DataFrame):
+                    self.varm[key] = matrix.iloc[var_dx, :]
+                else:
+                    self.varm[key] = matrix[var_dx, :]
+        return(cunnData(X = self.X,
+                        obs = self.obs.iloc[obs_dx,:],
+                        var = self.var.iloc[var_dx,:],
+                        uns=self.uns,
+                        layers= self.layers,
+                        obsm= self.obsm,
+                        varm= self.varm))
+
+    def _gen_repr(self, n_obs, n_vars) -> str:
+        """
+        Build the text representation of the object.
+
+        The first line reports `n_obs` and `n_vars`. For each of `obs`,
+        `var`, `uns`, `obsm`, `varm` and `layers` that has at least one key,
+        one indented line listing those keys is appended.
+
+        Parameters
+        ----------
+        n_obs
+            Number of observations to report.
+        n_vars
+            Number of variables to report.
+
+        Returns
+        -------
+        The multi-line description string.
+        """
+        descr = f"cunnData object with n_obs × n_vars = {n_obs} × {n_vars}"
+        for attr in [
+            "obs",
+            "var",
+            "uns",
+            "obsm",
+            "varm",
+            "layers",
+        ]:
+            # The constructor stores arguments such as `uns` as passed, so an
+            # attribute can be None; skip it instead of failing on `.keys()`.
+            container = getattr(self, attr, None)
+            if container is None:
+                continue
+            keys = container.keys()
+            if len(keys) > 0:
+                descr += f"\n    {attr}: {str(list(keys))[1:-1]}"
+        return descr
+
+    def __repr__(self) -> str:
+            """Return the description built by `_gen_repr` for the current `n_obs` and `n_vars`."""
+            return self._gen_repr(self.n_obs, self.n_vars)
 
 
     def to_AnnData(self):
@@ -200,4 +280,7 @@ def to_AnnData(self):
         if self.obsm:
             for key, matrix in self.obsm.items():
                 adata.obsm[key] = matrix.copy()
+        if self.varm:
+            for key, matrix in self.varm.items():
+                adata.varm[key] = matrix.copy()
         return adata
diff --git a/rapids_singlecell/cunnData_funcs/__init__.py b/rapids_singlecell/cunnData_funcs/__init__.py
@@ -1,5 +1,6 @@
 from ._regress_out import regress_out
 from ._scale import scale
+from ._pca import pca
 from ._hvg import highly_variable_genes
 from ._normalize import normalize_pearson_residuals, log1p, normalize_total
 from ._simple import filter_cells, filter_genes, filter_highly_variable