Commit

Merge pull request #124 from gustaveroussy/dev

Dev

quentinblampey authored Sep 17, 2024
2 parents f152ab1 + 29ca914 commit 92412be
Showing 10 changed files with 33 additions and 61 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,10 @@
## [1.x.x] - 2024-xx-xx
## [1.1.5] - 2024-09-17

### Fix
- Accept `object` dtype for channel names (#114)

### Changed
- Update MACSima reader to read the channel names of the latest file format

## [1.1.4] - 2024-08-21

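The fix referenced in #114 concerns channel-name arrays whose NumPy dtype kind is `object` rather than unicode. A minimal sketch of the dtype kinds involved (the channel names are illustrative, not from the commit):

```python
import numpy as np

# Channel names built from plain Python strings can end up as an object-dtype
# array instead of a unicode ("U") array, depending on how they were created.
c_coords_unicode = np.array(["DAPI", "CD3"])               # dtype.kind == "U"
c_coords_object = np.array(["DAPI", "CD3"], dtype=object)  # dtype.kind == "O"

# A check restricted to {"U", "S"} rejects the second array even though every
# element is a valid string; accepting kind "O" as well is what this release fixes.
print(c_coords_unicode.dtype.kind, c_coords_object.dtype.kind)  # U O
```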
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sopa"
version = "1.1.4"
version = "1.1.5"
description = "Spatial-omics pipeline and analysis"
documentation = "https://gustaveroussy.github.io/sopa"
homepage = "https://gustaveroussy.github.io/sopa"
4 changes: 2 additions & 2 deletions sopa/io/reader/cosmx.py
@@ -15,7 +15,7 @@
from spatialdata.models import Image2DModel, PointsModel
from spatialdata_io._constants._constants import CosmxKeys

from .utils import _deduplicate_c_coords, _default_image_kwargs
from .utils import _deduplicate_names, _default_image_kwargs

log = logging.getLogger(__name__)

@@ -138,7 +138,7 @@ def _read_fov_image(
protein_image, protein_names = _read_protein_fov(protein_path)
image = da.concatenate([image, protein_image], axis=0)

return image, _deduplicate_c_coords(morphology_coords + protein_names)
return image, _deduplicate_names(morphology_coords + protein_names)


def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
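In the CosMX reader, morphology channel names and protein channel names are concatenated before deduplication, so a stain present in both lists gets an index suffix. A small sketch of that behavior, reusing the `_deduplicate_names` logic introduced in `sopa/io/reader/utils.py` below (the channel names are made up):

```python
import pandas as pd

def _deduplicate_names(names):
    # Same logic as the new helper in sopa/io/reader/utils.py: every repeated
    # name gets its occurrence index (1, 2, ...) appended in brackets.
    names = pd.Series(names).astype(str)
    duplicates = names.duplicated()
    names[duplicates] += " (" + names.groupby(by=names).cumcount().astype(str)[duplicates] + ")"
    return names.values

# Hypothetical FOV: morphology channels followed by protein channels.
morphology_coords = ["DAPI", "PanCK", "CD45"]
protein_names = ["DAPI", "CD3"]
print(_deduplicate_names(morphology_coords + protein_names))
# ['DAPI' 'PanCK' 'CD45' 'DAPI (1)' 'CD3']
```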
28 changes: 4 additions & 24 deletions sopa/io/reader/macsima.py
@@ -1,13 +1,11 @@
from __future__ import annotations

import logging
import re
from pathlib import Path

import pandas as pd
from spatialdata import SpatialData

from .utils import _deduplicate_names, _general_tif_directory_reader
from .utils import _general_tif_directory_reader

log = logging.getLogger(__name__)

@@ -16,31 +14,13 @@ def macsima(path: Path, **kwargs: int) -> SpatialData:
"""Read MACSIMA data as a `SpatialData` object
Notes:
For all duplicated names, their index will be added in brackets after, for instance you will often find `DAPI (000)` to indicate the DAPI channel of index `000`
For all duplicated names, their index will be added in brackets after, for instance you may find `DAPI (1)`.
Args:
path: Path to the directory containing the MACSIMA `.tif` images
kwargs: Kwargs for `_general_tif_directory_reader`
kwargs: Kwargs for the `_general_tif_directory_reader`
Returns:
A `SpatialData` object with a 2D-image of shape `(C, Y, X)`
"""
return _general_tif_directory_reader(path, files_to_channels=_get_channel_names_macsima, **kwargs)


def _parse_name_macsima(file):
index = file.name[2:5] if file.name[0] == "C" else file.name[:3]
match = re.search(r"_A-(.*?)_C-", file.name)
if match:
antibody = match.group(1)
channel = re.search(r"_C-(.*?)\.tif", file.name).group(1)
uid = f"{channel}-{index}"
else:
antibody = re.search(r"_A-(.*?)\.tif", file.name).group(1)
uid = index
return [antibody, uid]


def _get_channel_names_macsima(files):
df_antibodies = pd.DataFrame([_parse_name_macsima(file) for file in files])
return _deduplicate_names(df_antibodies)
return _general_tif_directory_reader(path, **kwargs)
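After this change the MACSima reader simply delegates to `_general_tif_directory_reader`, which now derives channel names from the OME metadata of each `.tif` file. A hedged usage sketch (the directory path and the printed channel names are invented):

```python
import sopa.io
from sopa.utils import get_channel_names  # re-exported in sopa/utils/__init__.py below

# Hypothetical path to a MACSima acquisition directory containing .tif files.
sdata = sopa.io.macsima("/path/to/macsima_acquisition")

image_key = next(iter(sdata.images))
print(get_channel_names(sdata.images[image_key]))
# e.g. ['DAPI', 'CD3', 'PD-L1', 'DAPI (1)', ...] -- repeated stains get an index suffix
```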
10 changes: 1 addition & 9 deletions sopa/io/reader/phenocycler.py
@@ -5,7 +5,6 @@
from pathlib import Path

import dask.array as da
import pandas as pd
import tifffile as tf
from dask.delayed import delayed
from dask_image.imread import imread
@@ -39,7 +38,7 @@ def phenocycler(
if path.suffix == ".qptiff":
with tf.TiffFile(path) as tif:
series = tif.series[0]
names = _get_channel_names_qptiff(series)
names = _deduplicate_names([_get_channel_name_qptiff(page.description) for page in series])

delayed_image = delayed(lambda series: series.asarray())(tif)
image = da.from_delayed(delayed_image, dtype=series.dtype, shape=series.shape)
@@ -76,13 +75,6 @@ def _get_channel_name_qptiff(description):
return re.search(r"<Name>(.*?)</Name>", description).group(1)


def _get_channel_names_qptiff(page_series):
df_names = pd.DataFrame(
[[_get_channel_name_qptiff(page.description), str(i)] for i, page in enumerate(page_series)]
)
return _deduplicate_names(df_names)


def _get_IJ_channel_names(path: str) -> list[str]:
with tf.TiffFile(path) as tif:
default_names = [str(i) for i in range(len(tif.pages))]
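For `.qptiff` files, the channel name of each page is extracted from its XML-like `description` with the regex shown above, and the resulting list is then passed through `_deduplicate_names`. A small sketch of the extraction on made-up page descriptions (real QPTIFF metadata contains many more fields):

```python
import re

def _get_channel_name_qptiff(description):
    # Same regex as in the diff above: take the text inside the <Name> tag.
    return re.search(r"<Name>(.*?)</Name>", description).group(1)

# Made-up, heavily truncated page descriptions.
descriptions = [
    "<PerkinElmer-QPI-ImageDescription><Name>DAPI</Name></PerkinElmer-QPI-ImageDescription>",
    "<PerkinElmer-QPI-ImageDescription><Name>Opal 520</Name></PerkinElmer-QPI-ImageDescription>",
]
print([_get_channel_name_qptiff(d) for d in descriptions])
# ['DAPI', 'Opal 520']
```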
28 changes: 11 additions & 17 deletions sopa/io/reader/utils.py
@@ -1,12 +1,12 @@
from __future__ import annotations

import logging
from collections import defaultdict
from pathlib import Path
from typing import Callable

import dask.array as da
import numpy as np
import pandas as pd
import tifffile as tf
import xarray as xr
from dask_image.imread import imread
@@ -33,30 +33,24 @@ def _default_image_kwargs(
return image_models_kwargs, imread_kwargs


def _deduplicate_names(df):
is_duplicated = df[0].duplicated(keep=False)
df.loc[is_duplicated, 0] += " (" + df.loc[is_duplicated, 1] + ")"
return df[0].values
def _deduplicate_names(names: pd.Series | np.ndarray | list[str]) -> np.ndarray:
if not isinstance(names, pd.Series):
names = pd.Series(names)
names = names.astype(str)

duplicates = names.duplicated()
names[duplicates] += " (" + names.groupby(by=names).cumcount().astype(str)[duplicates] + ")"

def _deduplicate_c_coords(c_coords: list[str]) -> list[str]:
counter, res = defaultdict(int), []
for channel in c_coords:
if channel not in counter:
res.append(channel)
else:
res.append(f"{channel} ({counter[channel]})")
counter[channel] += 1
return res
return names.values


def _get_files_stem(files: list[Path]):
return [file.stem for file in files]
def _get_ome_channel_names(files):
return _deduplicate_names([_ome_channels_names(file)[0] for file in files])


def _general_tif_directory_reader(
path: str,
files_to_channels: Callable = _get_files_stem,
files_to_channels: Callable = _get_ome_channel_names,
suffix: str = ".tif",
image_models_kwargs: dict | None = None,
imread_kwargs: dict | None = None,
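The default `files_to_channels` callable changes from using file stems to `_get_ome_channel_names`, which takes the first OME channel name of each `.tif` file and deduplicates the result. The helper `_ome_channels_names` is not shown in this diff; a plausible sketch of what such a helper could look like, using `tifffile` and a simple regex (the function name and regex are assumptions, not the library's actual implementation):

```python
import re
import tifffile as tf

def ome_channel_names(path: str) -> list[str]:
    # Hypothetical helper: read the OME-XML block of a .tif file and collect
    # the Name attribute of every <Channel> element it declares.
    with tf.TiffFile(path) as tif:
        ome_xml = tif.ome_metadata or ""
    return re.findall(r'<Channel[^>]*\sName="([^"]+)"', ome_xml)

# For a directory of single-channel OME-TIFFs, the first name of each file
# would then be fed to _deduplicate_names to build the image's channel axis.
```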
4 changes: 2 additions & 2 deletions sopa/io/standardize.py
@@ -8,7 +8,7 @@

from .._constants import VALID_DIMENSIONS, SopaKeys
from .._sdata import get_spatial_image
from ..utils import _check_integer_dtype, get_channel_names, is_string_dtype
from ..utils import _check_integer_dtype, get_channel_names, valid_c_coords

log = logging.getLogger(__name__)

@@ -28,7 +28,7 @@ def sanity_check(sdata: SpatialData, delete_table: bool = False, warn: bool = Fa
)

c_coords = get_channel_names(image)
assert is_string_dtype(c_coords), f"Channel names must be strings, not {c_coords.dtype}"
assert valid_c_coords(c_coords), f"Channel names must be strings, not {c_coords.dtype}"

if SopaKeys.TABLE in sdata.tables:
if delete_table:
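The sanity check now asserts `valid_c_coords` on the image's channel coordinate, so bytes, unicode, and `object` dtypes all pass. A minimal sketch of running the check on a toy object (the image, its channel names, and the key are invented; this only illustrates where the assertion sits, not every check performed):

```python
import numpy as np
from spatialdata import SpatialData
from spatialdata.models import Image2DModel

from sopa.io.standardize import sanity_check

# Toy two-channel image with string channel names (values are placeholders).
image = Image2DModel.parse(
    np.zeros((2, 64, 64), dtype=np.uint16),
    dims=("c", "y", "x"),
    c_coords=["DAPI", "CD3"],
)
sdata = SpatialData(images={"image": image})

sanity_check(sdata)  # raises an AssertionError if the channel names are not string-like
```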
2 changes: 1 addition & 1 deletion sopa/utils/__init__.py
@@ -2,6 +2,6 @@
get_channel_names,
scale_dtype,
string_channel_names,
is_string_dtype,
valid_c_coords,
_check_integer_dtype,
)
6 changes: 3 additions & 3 deletions sopa/utils/image.py
@@ -78,15 +78,15 @@ def get_channel_names(image: DataArray | DataTree) -> np.ndarray:
raise ValueError(f"Image must be a DataTree or a DataArray. Found: {type(image)}")


def is_string_dtype(c_coords: np.ndarray) -> bool:
return c_coords.dtype.kind in {"U", "S"}
def valid_c_coords(c_coords: np.ndarray) -> bool:
return c_coords.dtype.kind in {"U", "S", "O"}


def string_channel_names(sdata: SpatialData, default_single_channel: str = "DAPI"):
for key, image in list(sdata.images.items()):
c_coords = get_channel_names(image)

if is_string_dtype(c_coords):
if valid_c_coords(c_coords):
continue

c_coords = [str(i) for i in range(len(c_coords))]
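`string_channel_names` keeps channel names that already pass `valid_c_coords` and otherwise replaces them with string indices. A simplified, self-contained mirror of that fallback (the single-channel `"DAPI"` default is inferred from the `default_single_channel` parameter name; the corresponding lines are cut off in this diff):

```python
import numpy as np

def fallback_channel_names(c_coords: np.ndarray, default_single_channel: str = "DAPI") -> list[str]:
    # Valid string-like names (unicode, bytes, or object dtype) are kept as-is.
    if c_coords.dtype.kind in {"U", "S", "O"}:
        return list(c_coords)
    # Assumed behavior: a single unnamed channel falls back to the default name.
    if len(c_coords) == 1:
        return [default_single_channel]
    # Otherwise, unnamed channels become their string indices, as in the diff above.
    return [str(i) for i in range(len(c_coords))]

print(fallback_channel_names(np.arange(3)))                              # ['0', '1', '2']
print(fallback_channel_names(np.array(["DAPI", "CD3"], dtype=object)))   # ['DAPI', 'CD3']
```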
2 changes: 1 addition & 1 deletion workflow/config/macsima/base.yaml
@@ -9,7 +9,7 @@ patchify:
segmentation:
cellpose:
diameter: 35
channels: ["DAPI (000)"]
channels: ["DAPI"]
flow_threshold: 2
cellprob_threshold: -6
min_area: 400
