Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files Browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed Jun 24, 2024
1 parent 475b762 commit 87ed2a6
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 20 deletions.
4 changes: 3 additions & 1 deletion src/cellarr/CellArrDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,4 +290,6 @@ def get_sample_subset(
Returns:
A pandas Dataframe of the subset.
"""
return qtd.subset_frame(self._sample_metadata_tdb, subset=subset, columns=columns)
return qtd.subset_frame(
self._sample_metadata_tdb, subset=subset, columns=columns
)
52 changes: 39 additions & 13 deletions src/cellarr/build_cellarrdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,9 @@ def build_cellarrdataset(
gene_annotation = pd.DataFrame({"cellarr_gene_index": gene_set}, index=gene_set)
elif isinstance(gene_annotation, list):
_gene_list = sorted(list(set(gene_annotation)))
gene_annotation = pd.DataFrame({"cellarr_gene_index": _gene_list}, index=_gene_list)
gene_annotation = pd.DataFrame(
{"cellarr_gene_index": _gene_list}, index=_gene_list
)
elif isinstance(gene_annotation, str):
gene_annotation = pd.read_csv(gene_annotation, index=True, header=True)
warnings.warn(
Expand All @@ -237,8 +239,12 @@ def build_cellarrdataset(
if not isinstance(gene_annotation, pd.DataFrame):
raise TypeError("'gene_annotation' must be a pandas dataframe.")

if len(gene_annotation["cellarr_gene_index"].unique()) != len(gene_annotation["cellarr_gene_index"].tolist()):
raise ValueError("'gene_annotation' must contain unique feature ids or gene symbols.")
if len(gene_annotation["cellarr_gene_index"].unique()) != len(
gene_annotation["cellarr_gene_index"].tolist()
):
raise ValueError(
"'gene_annotation' must contain unique feature ids or gene symbols."
)

gene_annotation.reset_index(drop=True, inplace=True)

Expand All @@ -249,7 +255,9 @@ def build_cellarrdataset(
_col_types[col] = "ascii"

_gene_output_uri = f"{output_path}/{gene_annotation_options.tiledb_store_name}"
generate_metadata_tiledb_frame(_gene_output_uri, gene_annotation, column_types=_col_types)
generate_metadata_tiledb_frame(
_gene_output_uri, gene_annotation, column_types=_col_types
)

if optimize_tiledb:
uta.optimize_tiledb_array(_gene_output_uri)
Expand Down Expand Up @@ -286,8 +294,12 @@ def build_cellarrdataset(
for col in sample_metadata.columns:
_col_types[col] = "ascii"

_sample_output_uri = f"{output_path}/{sample_metadata_options.tiledb_store_name}"
generate_metadata_tiledb_frame(_sample_output_uri, sample_metadata, column_types=_col_types)
_sample_output_uri = (
f"{output_path}/{sample_metadata_options.tiledb_store_name}"
)
generate_metadata_tiledb_frame(
_sample_output_uri, sample_metadata, column_types=_col_types
)

if optimize_tiledb:
uta.optimize_tiledb_array(_sample_output_uri)
Expand All @@ -307,7 +319,9 @@ def build_cellarrdataset(
_dataset.extend([f"dataset_{idx}" for _ in range(cci)])

if cell_metadata is None:
cell_metadata = pd.DataFrame({"cellarr_cell_counts": _cellindex_in_dataset, "cellarr_sample": _dataset})
cell_metadata = pd.DataFrame(
{"cellarr_cell_counts": _cellindex_in_dataset, "cellarr_sample": _dataset}
)
elif isinstance(cell_metadata, str):
warnings.warn(
"Scanning 'cell_metadata' csv file to count number of cells, this may take long",
Expand All @@ -329,7 +343,9 @@ def build_cellarrdataset(
)
elif isinstance(cell_metadata, pd.DataFrame):
if sum(cell_counts) != len(cell_metadata):
raise ValueError("Number of rows in 'cell_metadata' does not match the number of cells across files.")
raise ValueError(
"Number of rows in 'cell_metadata' does not match the number of cells across files."
)

cell_metadata["cellarr_sample"] = _dataset

Expand All @@ -339,15 +355,19 @@ def build_cellarrdataset(

if isinstance(cell_metadata, str):
_cell_metaframe = pd.read_csv(cell_metadata, chunksize=5, header=True)
generate_metadata_tiledb_csv(_cell_output_uri, cell_metadata, _cell_metaframe.columns)
generate_metadata_tiledb_csv(
_cell_output_uri, cell_metadata, _cell_metaframe.columns
)
elif isinstance(cell_metadata, pd.DataFrame):
_col_types = {}
for col in cell_metadata.columns:
_col_types[col] = "ascii"

_to_write = cell_metadata.astype(str)

generate_metadata_tiledb_frame(_cell_output_uri, _to_write, column_types=_col_types)
generate_metadata_tiledb_frame(
_cell_output_uri, _to_write, column_types=_col_types
)

if optimize_tiledb:
uta.optimize_tiledb_array(_cell_output_uri)
Expand Down Expand Up @@ -398,7 +418,9 @@ def build_cellarrdataset(
)


def generate_metadata_tiledb_frame(output_uri: str, input: pd.DataFrame, column_types: dict = None):
def generate_metadata_tiledb_frame(
output_uri: str, input: pd.DataFrame, column_types: dict = None
):
"""Generate metadata tiledb from a :py:class:`~pandas.DataFrame`.
Args:
Expand All @@ -415,7 +437,9 @@ def generate_metadata_tiledb_frame(output_uri: str, input: pd.DataFrame, column_
Defaults to None.
"""
_to_write = input.astype(str)
utf.create_tiledb_frame_from_dataframe(output_uri, _to_write, column_types=column_types)
utf.create_tiledb_frame_from_dataframe(
output_uri, _to_write, column_types=column_types
)


def generate_metadata_tiledb_csv(
Expand Down Expand Up @@ -451,7 +475,9 @@ def generate_metadata_tiledb_csv(

for chunk in pd.read_csv(input, chunksize=chunksize, header=True):
if initfile:
utf.create_tiledb_frame_from_column_names(output_uri, chunk.columns, column_dtype)
utf.create_tiledb_frame_from_column_names(
output_uri, chunk.columns, column_dtype
)
initfile = False

_to_write = chunk.astype(str)
Expand Down
4 changes: 1 addition & 3 deletions src/cellarr/globalcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@


def remove_package_cache():
"""Remove the package cache that contains temporary store for
scanned files.
"""
"""Remove the package cache that contains temporary store for scanned files."""
global PACKAGE_SCAN_CACHE
PACKAGE_SCAN_CACHE = {}
3 changes: 1 addition & 2 deletions src/cellarr/utils_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,7 @@ def extract_anndata_info(
num_threads: int = 1,
force: bool = False,
):
"""Extract and generate the list of unique feature identifiers
and cell counts across files.
"""Extract and generate the list of unique feature identifiers and cell counts across files.
Args:
h5ad_or_adata:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,4 @@ def test_build_cellarrdataset():

sfp = tiledb.open(f"{tempdir}/sample_metadata", "r")
samples = sfp.df[:]
assert len(samples) == 2
assert len(samples) == 2

0 comments on commit 87ed2a6

Please sign in to comment.