[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
BiocPy · Jun 19, 2024 · 25b096f · 25b096f
1 parent 376a5eb
commit 25b096f
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 35 deletions.
diff --git a/src/cellarr/CellArrDataset.py b/src/cellarr/CellArrDataset.py
@@ -12,9 +12,7 @@
 
 
 class CellArrDataset:
-    """A class that represent a collection of cells and their associated metadata 
-    in a TileDB backed store.
-    """
+    """A class that represents a collection of cells and their associated metadata in a TileDB backed store."""
 
     def __init__(
         self,
@@ -164,7 +162,9 @@ def get_gene_subset(
         if qtd._is_list_strings(subset):
             subset = self._get_indices_for_gene_list(subset)
 
-        return qtd.subset_frame(self._gene_annotation_tdb, subset=subset, columns=columns)
+        return qtd.subset_frame(
+            self._gene_annotation_tdb, subset=subset, columns=columns
+        )
 
     def get_slice(
         self,

diff --git a/src/cellarr/build_options.py b/src/cellarr/build_options.py
@@ -9,8 +9,8 @@
 
 @dataclass
 class CellMetadataOptions:
-    """Optional arguments for the ``cell_metadata`` store
-    for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+    """Optional arguments for the ``cell_metadata`` store for
+    :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
 
     Attributes:
         skip_cell_tiledb:
@@ -31,8 +31,8 @@ class CellMetadataOptions:
 
 @dataclass
 class GeneAnnotationOptions:
-    """Optional arguments for the ``gene_annotation`` store
-    for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+    """Optional arguments for the ``gene_annotation`` store for
+    :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
 
     Attributes:
         var_feature_column:
@@ -58,8 +58,7 @@ class GeneAnnotationOptions:
 
 @dataclass
 class MatrixOptions:
-    """Optional arguments for the ``matrix`` store
-    for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+    """Optional arguments for the ``matrix`` store for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
 
     Attributes:
         layer_matrix_name:
@@ -87,8 +86,7 @@ class MatrixOptions:
 
 @dataclass
 class SampleMetadataOptions:
-    """Optional arguments for the ``sample`` store
-    for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+    """Optional arguments for the ``sample`` store for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
 
     Attributes:
         skip_sample_tiledb:

diff --git a/src/cellarr/buildutils_cellarrdataset.py b/src/cellarr/buildutils_cellarrdataset.py
@@ -61,7 +61,6 @@
 from typing import List, Union
 
 import anndata
-import numpy as np
 import pandas as pd
 
 from . import utils_anndata as uad
@@ -96,12 +95,12 @@ def build_cellarrdataset(
     and package.
 
     There are a few assumptions this process makes:
-    - If object in ``files`` is an :py:class:`~anndata.AnnData` 
-    or H5AD object, these must contain an assay matrix in layer 
+    - If object in ``files`` is an :py:class:`~anndata.AnnData`
+    or H5AD object, these must contain an assay matrix in layer
     named by the ``layer_matrix_name`` parameter.
     - Feature information must contain a column defined by
-    ``var_feature_column`` in the 
-    :py:class:`~cellarr.build_options.GeneAnnotationOptions.` that 
+    ``var_feature_column`` in the
+    :py:class:`~cellarr.build_options.GeneAnnotationOptions` that
     contains feature ids or gene symbols across all files.
     - If no ``cell_metadata`` is provided, we scan to count the number of cells
     and create a simple range index.
@@ -192,18 +191,26 @@ def build_cellarrdataset(
         raise ValueError("'output_path' must be a directory.")
 
     if gene_metadata is None:
-        warnings.warn("Scanning all files for gene symbols, this may take long", UserWarning)
-        gene_set = uad.scan_for_features(files, var_gene_column=var_gene_column, num_threads=num_threads)
+        warnings.warn(
+            "Scanning all files for gene symbols, this may take long", UserWarning
+        )
+        gene_set = uad.scan_for_features(
+            files, var_gene_column=var_gene_column, num_threads=num_threads
+        )
 
         gene_set = sorted(gene_set)
 
         gene_metadata = pd.DataFrame({"cellarr_gene_index": gene_set}, index=gene_set)
     elif isinstance(gene_metadata, list):
         _gene_list = sorted(list(set(gene_metadata)))
-        gene_metadata = pd.DataFrame({"cellarr_gene_index": _gene_list}, index=_gene_list)
+        gene_metadata = pd.DataFrame(
+            {"cellarr_gene_index": _gene_list}, index=_gene_list
+        )
     elif isinstance(gene_metadata, dict):
         _gene_list = sorted(list(gene_metadata.keys()))
-        gene_metadata = pd.DataFrame({"cellarr_gene_index": _gene_list}, index=_gene_list)
+        gene_metadata = pd.DataFrame(
+            {"cellarr_gene_index": _gene_list}, index=_gene_list
+        )
     elif isinstance(gene_metadata, str):
         gene_metadata = pd.read_csv(gene_metadata, index=True, header=True)
         gene_metadata["cellarr_gene_index"] = gene_metadata.index.tolist()
@@ -226,7 +233,9 @@ def build_cellarrdataset(
             _col_types[col] = "ascii"
 
         _gene_output_uri = f"{output_path}/gene_metadata"
-        generate_metadata_tiledb_frame(_gene_output_uri, gene_metadata, column_types=_col_types)
+        generate_metadata_tiledb_frame(
+            _gene_output_uri, gene_metadata, column_types=_col_types
+        )
 
         if optimize_tiledb:
             uta.optimize_tiledb_array(_gene_output_uri)
@@ -242,7 +251,9 @@ def build_cellarrdataset(
         _cellindex_in_dataset.extend([x for x in range(cci)])
         _dataset.extend([f"dataset_{idx}" for _ in range(cci)])
 
-    _pseudo_cell_metadata = pd.DataFrame({"_cell_counts": _cellindex_in_dataset, "_cell_dataset_index": _dataset})
+    _pseudo_cell_metadata = pd.DataFrame(
+        {"_cell_counts": _cellindex_in_dataset, "_cell_dataset_index": _dataset}
+    )
 
     if num_cells is None:
         num_cells = sum(cell_counts)
@@ -268,23 +279,29 @@ def build_cellarrdataset(
             )
     elif isinstance(cell_metadata, pd.DataFrame):
         if num_cells != len(cell_metadata):
-            raise ValueError("Number of rows in 'cell_metadata' does not match the number of cells across files.")
+            raise ValueError(
+                "Number of rows in 'cell_metadata' does not match the number of cells across files."
+            )
 
     # Create the cell metadata tiledb
     if not skip_cell_tiledb:
         _cell_output_uri = f"{output_path}/cell_metadata"
 
         if isinstance(cell_metadata, str):
             _cell_metaframe = pd.read_csv(cell_metadata, chunksize=5, header=True)
-            generate_metadata_tiledb_csv(_cell_output_uri, cell_metadata, _cell_metaframe.columns)
+            generate_metadata_tiledb_csv(
+                _cell_output_uri, cell_metadata, _cell_metaframe.columns
+            )
         elif isinstance(cell_metadata, pd.DataFrame):
             _col_types = {}
             for col in gene_metadata.columns:
                 _col_types[col] = "ascii"
 
             _to_write = gene_metadata.astype(str)
 
-            generate_metadata_tiledb_frame(_cell_output_uri, _to_write, column_types=_col_types)
+            generate_metadata_tiledb_frame(
+                _cell_output_uri, _to_write, column_types=_col_types
+            )
 
         if optimize_tiledb:
             uta.optimize_tiledb_array(_cell_output_uri)
@@ -315,7 +332,9 @@ def build_cellarrdataset(
                 var_gene_column=var_gene_column,
                 layer_matrix_name=layer_matrix_name,
             )
-            uta.write_csr_matrix_to_tiledb(_counts_uri, matrix=mat, row_offset=offset, value_dtype=matrix_dim_dtype)
+            uta.write_csr_matrix_to_tiledb(
+                _counts_uri, matrix=mat, row_offset=offset, value_dtype=matrix_dim_dtype
+            )
             offset += int(mat.shape[0])
 
         if optimize_tiledb:
@@ -324,7 +343,9 @@ def build_cellarrdataset(
     return CellArrDataset(dataset_path=output_path, matrix_tdb_uri=layer_matrix_name)
 
 
-def generate_metadata_tiledb_frame(output_uri: str, input: pd.DataFrame, column_types: dict = None):
+def generate_metadata_tiledb_frame(
+    output_uri: str, input: pd.DataFrame, column_types: dict = None
+):
     """Generate metadata tiledb from a :py:class:`~pandas.DataFrame`.
 
     Args:
@@ -341,7 +362,9 @@ def generate_metadata_tiledb_frame(output_uri: str, input: pd.DataFrame, column_
             Defaults to None.
     """
     _to_write = input.astype(str)
-    utf.create_tiledb_frame_from_dataframe(output_uri, _to_write, column_types=column_types)
+    utf.create_tiledb_frame_from_dataframe(
+        output_uri, _to_write, column_types=column_types
+    )
 
 
 def generate_metadata_tiledb_csv(
@@ -377,7 +400,9 @@ def generate_metadata_tiledb_csv(
 
     for chunk in pd.read_csv(input, chunksize=chunksize, header=True):
         if initfile:
-            utf.create_tiledb_frame_from_column_names(output_uri, chunk.columns, column_dtype)
+            utf.create_tiledb_frame_from_column_names(
+                output_uri, chunk.columns, column_dtype
+            )
             initfile = False
 
         _to_write = chunk.astype(str)

diff --git a/src/cellarr/utils_anndata.py b/src/cellarr/utils_anndata.py
@@ -17,8 +17,8 @@ def remap_anndata(
     var_feature_column: str = "index",
     layer_matrix_name: str = "counts",
 ) -> csr_matrix:
-    """Extract and remap the count matrix to the provided feature (gene) set 
-    order from the :py:class:`~anndata.AnnData` object.
+    """Extract and remap the count matrix to the provided feature (gene) set order from the :py:class:`~anndata.AnnData`
+    object.
 
     Args:
         adata:
@@ -30,9 +30,9 @@ def remap_anndata(
             for the columns in the matrix.
 
         feature_set_order:
-            A dictionary with the feature ids as keys and their index as 
-            value (e.g. gene symbols). The feature ids from the 
-            ``AnnData`` object are remapped to the feature order from 
+            A dictionary with the feature ids as keys and their index as
+            value (e.g. gene symbols). The feature ids from the
+            ``AnnData`` object are remapped to the feature order from
             this dictionary.
 
         var_feature_column: