Commit

Merge pull request #124 from gustaveroussy/dev

Dev

quentinblampey authored Sep 17, 2024
2 parents f152ab1 + 29ca914 commit 92412be
Showing 10 changed files with 33 additions and 61 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,10 @@
## [1.x.x] - 2024-xx-xx
## [1.1.5] - 2024-09-17

### Fix
- Accept `object` dtype for channel names (#114)

### Changed
- Update MACSima reader to read the channel names of the latest file format

## [1.1.4] - 2024-08-21

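The fix referenced in #114 concerns channel-name arrays whose NumPy dtype kind is `object` rather than unicode. A minimal sketch of the dtype kinds involved (the channel names are illustrative, not from the commit):

```python
import numpy as np

# Channel names built from plain Python strings can end up as an object-dtype
# array instead of a unicode ("U") array, depending on how they were created.
c_coords_unicode = np.array(["DAPI", "CD3"])               # dtype.kind == "U"
c_coords_object = np.array(["DAPI", "CD3"], dtype=object)  # dtype.kind == "O"

# A check restricted to {"U", "S"} rejects the second array even though every
# element is a valid string; accepting kind "O" as well is what this release fixes.
print(c_coords_unicode.dtype.kind, c_coords_object.dtype.kind)  # U O
```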
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sopa"
version = "1.1.4"
version = "1.1.5"
description = "Spatial-omics pipeline and analysis"
documentation = "https://gustaveroussy.github.io/sopa"
homepage = "https://gustaveroussy.github.io/sopa"
4 changes: 2 additions & 2 deletions sopa/io/reader/cosmx.py
@@ -15,7 +15,7 @@
from spatialdata.models import Image2DModel, PointsModel
from spatialdata_io._constants._constants import CosmxKeys

from .utils import _deduplicate_c_coords, _default_image_kwargs
from .utils import _deduplicate_names, _default_image_kwargs

log = logging.getLogger(__name__)

@@ -138,7 +138,7 @@ def _read_fov_image(
protein_image, protein_names = _read_protein_fov(protein_path)
image = da.concatenate([image, protein_image], axis=0)

return image, _deduplicate_c_coords(morphology_coords + protein_names)
return image, _deduplicate_names(morphology_coords + protein_names)


def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
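In the CosMX reader, morphology channel names and protein channel names are concatenated before deduplication, so a stain present in both lists gets an index suffix. A small sketch of that behavior, reusing the `_deduplicate_names` logic introduced in `sopa/io/reader/utils.py` below (the channel names are made up):

```python
import pandas as pd

def _deduplicate_names(names):
    # Same logic as the new helper in sopa/io/reader/utils.py: every repeated
    # name gets its occurrence index (1, 2, ...) appended in brackets.
    names = pd.Series(names).astype(str)
    duplicates = names.duplicated()
    names[duplicates] += " (" + names.groupby(by=names).cumcount().astype(str)[duplicates] + ")"
    return names.values

# Hypothetical FOV: morphology channels followed by protein channels.
morphology_coords = ["DAPI", "PanCK", "CD45"]
protein_names = ["DAPI", "CD3"]
print(_deduplicate_names(morphology_coords + protein_names))
# ['DAPI' 'PanCK' 'CD45' 'DAPI (1)' 'CD3']
```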
28 changes: 4 additions & 24 deletions sopa/io/reader/macsima.py
@@ -1,13 +1,11 @@
from __future__ import annotations

import logging
import re
from pathlib import Path

import pandas as pd
from spatialdata import SpatialData

from .utils import _deduplicate_names, _general_tif_directory_reader
from .utils import _general_tif_directory_reader

log = logging.getLogger(__name__)

@@ -16,31 +14,13 @@ def macsima(path: Path, **kwargs: int) -> SpatialData:
"""Read MACSIMA data as a `SpatialData` object
Notes:
For all duplicated names, their index will be added in brackets after, for instance you will often find `DAPI (000)` to indicate the DAPI channel of index `000`
For all duplicated names, their index will be added in brackets after, for instance you may find `DAPI (1)`.
Args:
path: Path to the directory containing the MACSIMA `.tif` images
kwargs: Kwargs for `_general_tif_directory_reader`
kwargs: Kwargs for the `_general_tif_directory_reader`
Returns:
A `SpatialData` object with a 2D-image of shape `(C, Y, X)`
"""
return _general_tif_directory_reader(path, files_to_channels=_get_channel_names_macsima, **kwargs)


def _parse_name_macsima(file):
index = file.name[2:5] if file.name[0] == "C" else file.name[:3]
match = re.search(r"_A-(.*?)_C-", file.name)
if match:
antibody = match.group(1)
channel = re.search(r"_C-(.*?)\.tif", file.name).group(1)
uid = f"{channel}-{index}"
else:
antibody = re.search(r"_A-(.*?)\.tif", file.name).group(1)
uid = index
return [antibody, uid]


def _get_channel_names_macsima(files):
df_antibodies = pd.DataFrame([_parse_name_macsima(file) for file in files])
return _deduplicate_names(df_antibodies)
return _general_tif_directory_reader(path, **kwargs)
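After this change the MACSima reader simply delegates to `_general_tif_directory_reader`, which now derives channel names from the OME metadata of each `.tif` file. A hedged usage sketch (the directory path and the printed channel names are invented):

```python
import sopa.io
from sopa.utils import get_channel_names  # re-exported in sopa/utils/__init__.py below

# Hypothetical path to a MACSima acquisition directory containing .tif files.
sdata = sopa.io.macsima("/path/to/macsima_acquisition")

image_key = next(iter(sdata.images))
print(get_channel_names(sdata.images[image_key]))
# e.g. ['DAPI', 'CD3', 'PD-L1', 'DAPI (1)', ...] -- repeated stains get an index suffix
```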
10 changes: 1 addition & 9 deletions sopa/io/reader/phenocycler.py
@@ -5,7 +5,6 @@
from pathlib import Path

import dask.array as da
import pandas as pd
import tifffile as tf
from dask.delayed import delayed
from dask_image.imread import imread
@@ -39,7 +38,7 @@ def phenocycler(
if path.suffix == ".qptiff":
with tf.TiffFile(path) as tif:
series = tif.series[0]
names = _get_channel_names_qptiff(series)
names = _deduplicate_names([_get_channel_name_qptiff(page.description) for page in series])

delayed_image = delayed(lambda series: series.asarray())(tif)
image = da.from_delayed(delayed_image, dtype=series.dtype, shape=series.shape)
@@ -76,13 +75,6 @@ def _get_channel_name_qptiff(description):
return re.search(r"<Name>(.*?)</Name>", description).group(1)


def _get_channel_names_qptiff(page_series):
df_names = pd.DataFrame(
[[_get_channel_name_qptiff(page.description), str(i)] for i, page in enumerate(page_series)]
)
return _deduplicate_names(df_names)


def _get_IJ_channel_names(path: str) -> list[str]:
with tf.TiffFile(path) as tif:
default_names = [str(i) for i in range(len(tif.pages))]
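For `.qptiff` files, the channel name of each page is extracted from its XML-like `description` with the regex shown above, and the resulting list is then passed through `_deduplicate_names`. A small sketch of the extraction on made-up page descriptions (real QPTIFF metadata contains many more fields):

```python
import re

def _get_channel_name_qptiff(description):
    # Same regex as in the diff above: take the text inside the <Name> tag.
    return re.search(r"<Name>(.*?)</Name>", description).group(1)

# Made-up, heavily truncated page descriptions.
descriptions = [
    "<PerkinElmer-QPI-ImageDescription><Name>DAPI</Name></PerkinElmer-QPI-ImageDescription>",
    "<PerkinElmer-QPI-ImageDescription><Name>Opal 520</Name></PerkinElmer-QPI-ImageDescription>",
]
print([_get_channel_name_qptiff(d) for d in descriptions])
# ['DAPI', 'Opal 520']
```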
28 changes: 11 additions & 17 deletions sopa/io/reader/utils.py
@@ -1,12 +1,12 @@
from __future__ import annotations

import logging
from collections import defaultdict
from pathlib import Path
from typing import Callable

import dask.array as da
import numpy as np
import pandas as pd
import tifffile as tf
import xarray as xr
from dask_image.imread import imread
@@ -33,30 +33,24 @@ def _default_image_kwargs(
return image_models_kwargs, imread_kwargs


def _deduplicate_names(df):
is_duplicated = df[0].duplicated(keep=False)
df.loc[is_duplicated, 0] += " (" + df.loc[is_duplicated, 1] + ")"
return df[0].values
def _deduplicate_names(names: pd.Series | np.ndarray | list[str]) -> np.ndarray:
if not isinstance(names, pd.Series):
names = pd.Series(names)
names = names.astype(str)

duplicates = names.duplicated()
names[duplicates] += " (" + names.groupby(by=names).cumcount().astype(str)[duplicates] + ")"

def _deduplicate_c_coords(c_coords: list[str]) -> list[str]:
counter, res = defaultdict(int), []
for channel in c_coords:
if channel not in counter:
res.append(channel)
else:
res.append(f"{channel} ({counter[channel]})")
counter[channel] += 1
return res
return names.values


def _get_files_stem(files: list[Path]):
return [file.stem for file in files]
def _get_ome_channel_names(files):
return _deduplicate_names([_ome_channels_names(file)[0] for file in files])


def _general_tif_directory_reader(
path: str,
files_to_channels: Callable = _get_files_stem,
files_to_channels: Callable = _get_ome_channel_names,
suffix: str = ".tif",
image_models_kwargs: dict | None = None,
imread_kwargs: dict | None = None,
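The default `files_to_channels` callable changes from using file stems to `_get_ome_channel_names`, which takes the first OME channel name of each `.tif` file and deduplicates the result. The helper `_ome_channels_names` is not shown in this diff; a plausible sketch of what such a helper could look like, using `tifffile` and a simple regex (the function name and regex are assumptions, not the library's actual implementation):

```python
import re
import tifffile as tf

def ome_channel_names(path: str) -> list[str]:
    # Hypothetical helper: read the OME-XML block of a .tif file and collect
    # the Name attribute of every <Channel> element it declares.
    with tf.TiffFile(path) as tif:
        ome_xml = tif.ome_metadata or ""
    return re.findall(r'<Channel[^>]*\sName="([^"]+)"', ome_xml)

# For a directory of single-channel OME-TIFFs, the first name of each file
# would then be fed to _deduplicate_names to build the image's channel axis.
```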
4 changes: 2 additions & 2 deletions sopa/io/standardize.py
@@ -8,7 +8,7 @@

from .._constants import VALID_DIMENSIONS, SopaKeys
from .._sdata import get_spatial_image
from ..utils import _check_integer_dtype, get_channel_names, is_string_dtype
from ..utils import _check_integer_dtype, get_channel_names, valid_c_coords

log = logging.getLogger(__name__)

@@ -28,7 +28,7 @@ def sanity_check(sdata: SpatialData, delete_table: bool = False, warn: bool = Fa
)

c_coords = get_channel_names(image)
assert is_string_dtype(c_coords), f"Channel names must be strings, not {c_coords.dtype}"
assert valid_c_coords(c_coords), f"Channel names must be strings, not {c_coords.dtype}"

if SopaKeys.TABLE in sdata.tables:
if delete_table:
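The sanity check now asserts `valid_c_coords` on the image's channel coordinate, so bytes, unicode, and `object` dtypes all pass. A minimal sketch of running the check on a toy object (the image, its channel names, and the key are invented; this only illustrates where the assertion sits, not every check performed):

```python
import numpy as np
from spatialdata import SpatialData
from spatialdata.models import Image2DModel

from sopa.io.standardize import sanity_check

# Toy two-channel image with string channel names (values are placeholders).
image = Image2DModel.parse(
    np.zeros((2, 64, 64), dtype=np.uint16),
    dims=("c", "y", "x"),
    c_coords=["DAPI", "CD3"],
)
sdata = SpatialData(images={"image": image})

sanity_check(sdata)  # raises an AssertionError if the channel names are not string-like
```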
2 changes: 1 addition & 1 deletion sopa/utils/__init__.py
@@ -2,6 +2,6 @@
get_channel_names,
scale_dtype,
string_channel_names,
is_string_dtype,
valid_c_coords,
_check_integer_dtype,
)
6 changes: 3 additions & 3 deletions sopa/utils/image.py
@@ -78,15 +78,15 @@ def get_channel_names(image: DataArray | DataTree) -> np.ndarray:
raise ValueError(f"Image must be a DataTree or a DataArray. Found: {type(image)}")


def is_string_dtype(c_coords: np.ndarray) -> bool:
return c_coords.dtype.kind in {"U", "S"}
def valid_c_coords(c_coords: np.ndarray) -> bool:
return c_coords.dtype.kind in {"U", "S", "O"}


def string_channel_names(sdata: SpatialData, default_single_channel: str = "DAPI"):
for key, image in list(sdata.images.items()):
c_coords = get_channel_names(image)

if is_string_dtype(c_coords):
if valid_c_coords(c_coords):
continue

c_coords = [str(i) for i in range(len(c_coords))]
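`string_channel_names` keeps channel names that already pass `valid_c_coords` and otherwise replaces them with string indices. A simplified, self-contained mirror of that fallback (the single-channel `"DAPI"` default is inferred from the `default_single_channel` parameter name; the corresponding lines are cut off in this diff):

```python
import numpy as np

def fallback_channel_names(c_coords: np.ndarray, default_single_channel: str = "DAPI") -> list[str]:
    # Valid string-like names (unicode, bytes, or object dtype) are kept as-is.
    if c_coords.dtype.kind in {"U", "S", "O"}:
        return list(c_coords)
    # Assumed behavior: a single unnamed channel falls back to the default name.
    if len(c_coords) == 1:
        return [default_single_channel]
    # Otherwise, unnamed channels become their string indices, as in the diff above.
    return [str(i) for i in range(len(c_coords))]

print(fallback_channel_names(np.arange(3)))                              # ['0', '1', '2']
print(fallback_channel_names(np.array(["DAPI", "CD3"], dtype=object)))   # ['DAPI', 'CD3']
```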
2 changes: 1 addition & 1 deletion workflow/config/macsima/base.yaml
@@ -9,7 +9,7 @@ patchify:
segmentation:
cellpose:
diameter: 35
channels: ["DAPI (000)"]
channels: ["DAPI"]
flow_threshold: 2
cellprob_threshold: -6
min_area: 400
