Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files Browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed Jun 19, 2024
1 parent 376a5eb commit 25b096f
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 35 deletions.
8 changes: 4 additions & 4 deletions src/cellarr/CellArrDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@


class CellArrDataset:
"""A class that represents a collection of cells and their associated metadata
in a TileDB backed store.
"""
"""A class that represents a collection of cells and their associated metadata in a TileDB backed store."""

def __init__(
self,
Expand Down Expand Up @@ -164,7 +162,9 @@ def get_gene_subset(
if qtd._is_list_strings(subset):
subset = self._get_indices_for_gene_list(subset)

return qtd.subset_frame(self._gene_annotation_tdb, subset=subset, columns=columns)
return qtd.subset_frame(
self._gene_annotation_tdb, subset=subset, columns=columns
)

def get_slice(
self,
Expand Down
14 changes: 6 additions & 8 deletions src/cellarr/build_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

@dataclass
class CellMetadataOptions:
"""Optional arguments for the ``cell_metadata`` store
for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
"""Optional arguments for the ``cell_metadata`` store for
:py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
Attributes:
skip_cell_tiledb:
Expand All @@ -31,8 +31,8 @@ class CellMetadataOptions:

@dataclass
class GeneAnnotationOptions:
"""Optional arguments for the ``gene_annotation`` store
for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
"""Optional arguments for the ``gene_annotation`` store for
:py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
Attributes:
var_feature_column:
Expand All @@ -58,8 +58,7 @@ class GeneAnnotationOptions:

@dataclass
class MatrixOptions:
"""Optional arguments for the ``matrix`` store
for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
"""Optional arguments for the ``matrix`` store for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
Attributes:
layer_matrix_name:
Expand Down Expand Up @@ -87,8 +86,7 @@ class MatrixOptions:

@dataclass
class SampleMetadataOptions:
"""Optional arguments for the ``sample`` store
for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
"""Optional arguments for the ``sample`` store for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
Attributes:
skip_sample_tiledb:
Expand Down
61 changes: 43 additions & 18 deletions src/cellarr/buildutils_cellarrdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
from typing import List, Union

import anndata
import numpy as np
import pandas as pd

from . import utils_anndata as uad
Expand Down Expand Up @@ -96,12 +95,12 @@ def build_cellarrdataset(
and package.
There are a few assumptions this process makes:
- If object in ``files`` is an :py:class:`~anndata.AnnData`
or H5AD object, these must contain an assay matrix in layer
- If object in ``files`` is an :py:class:`~anndata.AnnData`
or H5AD object, these must contain an assay matrix in layer
named by the ``layer_matrix_name`` parameter.
- Feature information must contain a column defined by
``var_feature_column`` in the
:py:class:`~cellarr.build_options.GeneAnnotationOptions` that
``var_feature_column`` in the
:py:class:`~cellarr.build_options.GeneAnnotationOptions` that
contains feature ids or gene symbols across all files.
- If no ``cell_metadata`` is provided, we scan to count the number of cells
and create a simple range index.
Expand Down Expand Up @@ -192,18 +191,26 @@ def build_cellarrdataset(
raise ValueError("'output_path' must be a directory.")

if gene_metadata is None:
warnings.warn("Scanning all files for gene symbols, this may take long", UserWarning)
gene_set = uad.scan_for_features(files, var_gene_column=var_gene_column, num_threads=num_threads)
warnings.warn(
"Scanning all files for gene symbols, this may take long", UserWarning
)
gene_set = uad.scan_for_features(
files, var_gene_column=var_gene_column, num_threads=num_threads
)

gene_set = sorted(gene_set)

gene_metadata = pd.DataFrame({"cellarr_gene_index": gene_set}, index=gene_set)
elif isinstance(gene_metadata, list):
_gene_list = sorted(list(set(gene_metadata)))
gene_metadata = pd.DataFrame({"cellarr_gene_index": _gene_list}, index=_gene_list)
gene_metadata = pd.DataFrame(
{"cellarr_gene_index": _gene_list}, index=_gene_list
)
elif isinstance(gene_metadata, dict):
_gene_list = sorted(list(gene_metadata.keys()))
gene_metadata = pd.DataFrame({"cellarr_gene_index": _gene_list}, index=_gene_list)
gene_metadata = pd.DataFrame(
{"cellarr_gene_index": _gene_list}, index=_gene_list
)
elif isinstance(gene_metadata, str):
gene_metadata = pd.read_csv(gene_metadata, index=True, header=True)
gene_metadata["cellarr_gene_index"] = gene_metadata.index.tolist()
Expand All @@ -226,7 +233,9 @@ def build_cellarrdataset(
_col_types[col] = "ascii"

_gene_output_uri = f"{output_path}/gene_metadata"
generate_metadata_tiledb_frame(_gene_output_uri, gene_metadata, column_types=_col_types)
generate_metadata_tiledb_frame(
_gene_output_uri, gene_metadata, column_types=_col_types
)

if optimize_tiledb:
uta.optimize_tiledb_array(_gene_output_uri)
Expand All @@ -242,7 +251,9 @@ def build_cellarrdataset(
_cellindex_in_dataset.extend([x for x in range(cci)])
_dataset.extend([f"dataset_{idx}" for _ in range(cci)])

_pseudo_cell_metadata = pd.DataFrame({"_cell_counts": _cellindex_in_dataset, "_cell_dataset_index": _dataset})
_pseudo_cell_metadata = pd.DataFrame(
{"_cell_counts": _cellindex_in_dataset, "_cell_dataset_index": _dataset}
)

if num_cells is None:
num_cells = sum(cell_counts)
Expand All @@ -268,23 +279,29 @@ def build_cellarrdataset(
)
elif isinstance(cell_metadata, pd.DataFrame):
if num_cells != len(cell_metadata):
raise ValueError("Number of rows in 'cell_metadata' does not match the number of cells across files.")
raise ValueError(
"Number of rows in 'cell_metadata' does not match the number of cells across files."
)

# Create the cell metadata tiledb
if not skip_cell_tiledb:
_cell_output_uri = f"{output_path}/cell_metadata"

if isinstance(cell_metadata, str):
_cell_metaframe = pd.read_csv(cell_metadata, chunksize=5, header=True)
generate_metadata_tiledb_csv(_cell_output_uri, cell_metadata, _cell_metaframe.columns)
generate_metadata_tiledb_csv(
_cell_output_uri, cell_metadata, _cell_metaframe.columns
)
elif isinstance(cell_metadata, pd.DataFrame):
_col_types = {}
for col in gene_metadata.columns:
_col_types[col] = "ascii"

_to_write = gene_metadata.astype(str)

generate_metadata_tiledb_frame(_cell_output_uri, _to_write, column_types=_col_types)
generate_metadata_tiledb_frame(
_cell_output_uri, _to_write, column_types=_col_types
)

if optimize_tiledb:
uta.optimize_tiledb_array(_cell_output_uri)
Expand Down Expand Up @@ -315,7 +332,9 @@ def build_cellarrdataset(
var_gene_column=var_gene_column,
layer_matrix_name=layer_matrix_name,
)
uta.write_csr_matrix_to_tiledb(_counts_uri, matrix=mat, row_offset=offset, value_dtype=matrix_dim_dtype)
uta.write_csr_matrix_to_tiledb(
_counts_uri, matrix=mat, row_offset=offset, value_dtype=matrix_dim_dtype
)
offset += int(mat.shape[0])

if optimize_tiledb:
Expand All @@ -324,7 +343,9 @@ def build_cellarrdataset(
return CellArrDataset(dataset_path=output_path, matrix_tdb_uri=layer_matrix_name)


def generate_metadata_tiledb_frame(output_uri: str, input: pd.DataFrame, column_types: dict = None):
def generate_metadata_tiledb_frame(
output_uri: str, input: pd.DataFrame, column_types: dict = None
):
"""Generate metadata tiledb from a :py:class:`~pandas.DataFrame`.
Args:
Expand All @@ -341,7 +362,9 @@ def generate_metadata_tiledb_frame(output_uri: str, input: pd.DataFrame, column_
Defaults to None.
"""
_to_write = input.astype(str)
utf.create_tiledb_frame_from_dataframe(output_uri, _to_write, column_types=column_types)
utf.create_tiledb_frame_from_dataframe(
output_uri, _to_write, column_types=column_types
)


def generate_metadata_tiledb_csv(
Expand Down Expand Up @@ -377,7 +400,9 @@ def generate_metadata_tiledb_csv(

for chunk in pd.read_csv(input, chunksize=chunksize, header=True):
if initfile:
utf.create_tiledb_frame_from_column_names(output_uri, chunk.columns, column_dtype)
utf.create_tiledb_frame_from_column_names(
output_uri, chunk.columns, column_dtype
)
initfile = False

_to_write = chunk.astype(str)
Expand Down
10 changes: 5 additions & 5 deletions src/cellarr/utils_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def remap_anndata(
var_feature_column: str = "index",
layer_matrix_name: str = "counts",
) -> csr_matrix:
"""Extract and remap the count matrix to the provided feature (gene) set
order from the :py:class:`~anndata.AnnData` object.
"""Extract and remap the count matrix to the provided feature (gene) set order from the :py:class:`~anndata.AnnData`
object.
Args:
adata:
Expand All @@ -30,9 +30,9 @@ def remap_anndata(
for the columns in the matrix.
feature_set_order:
A dictionary with the feature ids as keys and their index as
value (e.g. gene symbols). The feature ids from the
``AnnData`` object are remapped to the feature order from
A dictionary with the feature ids as keys and their index as
value (e.g. gene symbols). The feature ids from the
``AnnData`` object are remapped to the feature order from
this dictionary.
var_feature_column:
Expand Down

0 comments on commit 25b096f

Please sign in to comment.