Skip to content

Commit

Permalink
more modifications
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche committed Jun 18, 2024
1 parent f80a113 commit c11e5e0
Show file tree
Hide file tree
Showing 8 changed files with 247 additions and 63 deletions.
199 changes: 155 additions & 44 deletions src/cellarr/CellArrDataset.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,52 @@
import os
from typing import Union
from typing import List, Union, Sequence

import pandas as pd
import tiledb

from . import queryutils_tiledb_frame as qtd

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


class CellArrDataset:
"""A class that represent a collection of cells in TileDB."""
"""A class that represent a collection of cells and their associated metadata
in a TileDB backed store.
"""

def __init__(
self,
dataset_path: str,
counts_tdb_uri: str = "counts",
matrix_tdb_uri: str = "counts",
gene_metadata_uri: str = "gene_metadata",
cell_metadata_uri: str = "cell_metadata",
):
"""Initialize a ``CellArrDataset``.
Args:
dataset_path:
Path to the directory containing the tiledb files.
Usually the output_path used in the
Path to the directory containing the tiledb stores.
Usually the ``output_path`` from the
:py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
counts_tdb_uri:
Path to counts TileDB.
Relative path to matrix store.
gene_metadata_uri:
Path to gene metadata TileDB.
Relative path to gene metadata store.
cell_metadata_uri:
Path to cell metadata TileDB.
Relative path to cell metadata store.
"""

if not os.path.isdir(dataset_path):
raise ValueError("'dataset_path' is not a directory.")

self._dataset_path = dataset_path
# TODO: Maybe switch to on-demand loading of these objects
self._counts_tdb_tdb = tiledb.open(f"{dataset_path}/{counts_tdb_uri}", "r")
self._matrix_tdb_tdb = tiledb.open(f"{dataset_path}/{matrix_tdb_uri}", "r")
self._gene_metadata_tdb = tiledb.open(
f"{dataset_path}/{gene_metadata_uri}", "r"
)
Expand All @@ -50,67 +55,173 @@ def __init__(
)

def __del__(self):
self._counts_tdb_tdb.close()
self._matrix_tdb_tdb.close()
self._gene_metadata_tdb.close()
self._cell_metadata_tdb.close()

# TODO:
# Methods to implement
# search by gene
# search by cell metadata
# slice counts after search
def get_cell_metadata_columns(self) -> List[str]:
"""Get column names from ``cell_metadata`` store.
Returns:
List of available metadata columns.
"""
return qtd.get_schema_names_frame(self._cell_metadata_tdb)

def get_cell_metadata_columns(self):
columns = []
for i in range(self._cell_metadata_tdb.schema.nattr):
columns.append(self._cell_metadata_tdb.schema.attr(i).name)
def get_cell_metadata_column(self, column_name: str) -> list:
"""Access a column from the ``cell_metadata`` store.
return columns
Args:
column_name:
Name of the column or attribute. Usually one of the column names
from of :py:meth:`~get_cell_metadata_columns`.
def get_cell_metadata_column(self, column_name: str):
return self._cell_metadata_tdb.query(attrs=[column_name]).df[:]
Returns:
A list of values for this column.
"""
return qtd.get_a_column(self._cell_metadata_tdb, column_name=column_name)

def get_cell_subset(
self, subset: Union[slice, tiledb.QueryCondition], columns=None
):
if columns is None:
columns = self.get_cell_metadata_columns()
) -> pd.DataFrame:
"""Slice the ``cell_metadata`` store.
query = self._cell_metadata_tdb.query(cond=subset, attrs=columns)
data = query.df[:]
result = data.dropna()
return result
Args:
subset:
A list of integer indices to subset the ``cell_metadata``
store.
Alternatively, may also provide a
:py:class:`tiledb.QueryCondition` to query the store.
columns:
List of specific column names to access.
def get_gene_metadata_columns(self):
columns = []
for i in range(self._gene_metadata_tdb.schema.nattr):
columns.append(self._gene_metadata_tdb.schema.attr(i).name)
Defaults to None, in which case all columns are extracted.
Returns:
A pandas Dataframe of the subset.
"""
return qtd.subset_frame(self._cell_metadata_tdb, subset=subset, columns=columns)

return columns
def get_gene_metadata_columns(self) -> List[str]:
"""Get annotation column names from ``gene_metadata`` store.
Returns:
List of available annotations.
"""
return qtd.get_schema_names_frame(self._gene_metadata_tdb)

def get_gene_metadata_column(self, column_name: str):
return self._gene_metadata_tdb.query(attrs=[column_name]).df[:]
"""Access a column from the ``gene_metadata`` store.
Args:
column_name:
Name of the column or attribute. Usually one of the column names
from of :py:meth:`~get_gene_metadata_columns`.
Returns:
A list of values for this column.
"""
return qtd.get_a_column(self._gene_metadata_tdb, column_name=column_name)

def get_gene_metadata_index(self):
"""Get index of the ``gene_metadata`` store. This typically should store
all unique gene symbols.
Returns:
List of unique symbols.
"""
return qtd.get_index(self._gene_metadata_tdb)

def _get_indices_for_gene_list(self, query: list) -> List[int]:
_gene_index = self.get_gene_metadata_index()
return qtd._match_to_list(_gene_index, query=query)

def get_gene_subset(
self, subset: Union[slice, tiledb.QueryCondition], columns=None
self, subset: Union[slice, List[str], tiledb.QueryCondition], columns=None
):
if columns is None:
columns = self.get_gene_metadata_columns()
"""Slice the ``gene_metadata`` store.
Args:
subset:
A list of integer indices to subset the ``gene_metadata``
store.
Alternatively, may provide a
:py:class:`tiledb.QueryCondition` to query the store.
Alternatively, may provide a list of strings to match with
the index of ``gene_metadata`` store.
columns:
List of specific column names to access.
query = self._gene_metadata_tdb.query(cond=subset, attrs=columns)
data = query.df[:]
result = data.dropna()
return result
Defaults to None, in which case all columns are extracted.
Returns:
A pandas Dataframe of the subset.
"""

if qtd._is_list_strings(subset):
subset = self._get_indices_for_gene_list(subset)

return qtd.subset_frame(self._gene_metadata_tdb, subset=subset, columns=columns)

def get_slice(
self,
cell_subset: Union[slice, tiledb.QueryCondition],
gene_subset: Union[slice, tiledb.QueryCondition],
gene_subset: Union[slice, List[str], tiledb.QueryCondition],
):
_csubset = self.get_cell_subset(cell_subset)
_cell_indices = _csubset.index.tolist()

_gsubset = self.get_gene_subset(gene_subset)
_gene_indices = _gsubset.index.tolist()

return self._counts_tdb_tdb.multi_index[_cell_indices, _gene_indices]
return self._matrix_tdb_tdb.multi_index[_cell_indices, _gene_indices]

def __getitem__(
self,
args: Union[int, str, Sequence, tuple],
):
"""Subset a ``SummarizedExperiment``.
Args:
args:
Integer indices, a boolean filter, or (if the current object is
named) names specifying the ranges to be extracted, see
:py:meth:`~biocutils.normalize_subscript.normalize_subscript`.
Alternatively a tuple of length 1. The first entry specifies
the rows to retain based on their names or indices.
Alternatively a tuple of length 2. The first entry specifies
the rows to retain, while the second entry specifies the
columns to retain, based on their names or indices.
Raises:
ValueError:
If too many or too few slices provided.
Returns:
Same type as caller with the sliced rows and columns.
"""
if isinstance(args, (str, int)):
return self.get_slice(args, slice(None))

if isinstance(args, tuple):
if len(args) == 0:
raise ValueError("At least one slicing argument must be provided.")

if len(args) == 1:
return self.get_slice(args[0], slice(None))
elif len(args) == 2:
return self.get_slice(args[0], args[1])
else:
raise ValueError(
f"`{type(self).__name__}` only supports 2-dimensional slicing."
)

raise TypeError(
"args must be a sequence or a scalar integer or string or a tuple of atmost 2 values."
)
2 changes: 1 addition & 1 deletion src/cellarr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@
finally:
del version, PackageNotFoundError

from .build_cellarrdataset import build_cellarrdataset
from .buildutils_cellarrdataset import build_cellarrdataset
from .CellArrDataset import CellArrDataset
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
- `gene_metadata`: A TileDB file containing gene metadata.
- `cell_metadata`: A TileDB file containing cell metadata.
- A matrix TileDB file named as specified by the `layer_matrix_name` parameter.
- A matrix TileDB file named by the `layer_matrix_name` parameter.
The TileDB matrix file is stored in a cell X gene orientation. This orientation
is chosen because the fastest-changing dimension as new files are added to the
Expand Down Expand Up @@ -62,8 +62,8 @@
import pandas as pd

from . import utils_anndata as uad
from . import utils_tiledb_array as uta
from . import utils_tiledb_frame as utf
from . import buildutils_tiledb_array as uta
from . import buildutils_tiledb_frame as utf
from .CellArrDataset import CellArrDataset

__author__ = "Jayaram Kancherla"
Expand Down Expand Up @@ -232,24 +232,25 @@ def build_cellarrdataset(

gene_set = sorted(gene_set)

gene_metadata = pd.DataFrame({"genes": gene_set}, index=gene_set)
gene_metadata = pd.DataFrame({"cellarr_gene_index": gene_set}, index=gene_set)
elif isinstance(gene_metadata, list):
_gene_list = sorted(list(set(gene_metadata)))
gene_metadata = pd.DataFrame({"genes": _gene_list}, index=_gene_list)
gene_metadata = pd.DataFrame({"cellarr_gene_index": _gene_list}, index=_gene_list)
elif isinstance(gene_metadata, dict):
_gene_list = sorted(list(gene_metadata.keys()))
gene_metadata = pd.DataFrame({"genes": _gene_list}, index=_gene_list)
gene_metadata = pd.DataFrame({"cellarr_gene_index": _gene_list}, index=_gene_list)
elif isinstance(gene_metadata, str):
gene_metadata = pd.read_csv(gene_metadata, index=True, header=True)

gene_metadata["genes_index"] = gene_metadata.index.tolist()
gene_metadata["cellarr_gene_index"] = gene_metadata.index.tolist()

if not isinstance(gene_metadata, pd.DataFrame):
raise TypeError("'gene_metadata' must be a pandas dataframe.")

if len(gene_metadata.index.unique()) != len(gene_metadata.index.tolist()):
raise ValueError("'gene_metadata' must contain a unique index.")

gene_metadata.reset_index(drop=True, inplace=True)

if num_genes is None:
num_genes = len(gene_metadata)

Expand Down Expand Up @@ -367,7 +368,7 @@ def build_cellarrdataset(
if optimize_tiledb:
uta.optimize_tiledb_array(_counts_uri)

return CellArrDataset(dataset_path=output_path, counts_tdb_uri=layer_matrix_name)
return CellArrDataset(dataset_path=output_path, matrix_tdb_uri=layer_matrix_name)


def generate_metadata_tiledb_frame(
Expand Down
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit c11e5e0

Please sign in to comment.