Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LyNSeC data #396

Merged
merged 2 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions scripts/datasets/histopathology/check_lynsec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
import sys

from torch_em.util.debug import check_loader
from torch_em.data.datasets import get_lynsec_loader

sys.path.append("..")


def check_lynsec():
    """Create a LyNSeC loader for the H&E subset and hand it to `check_loader`."""
    # `util` is only importable because the module appends ".." to `sys.path`
    # before this function runs, hence the function-level import.
    from util import ROOT

    data_root = os.path.join(ROOT, "lynsec")
    loader_settings = dict(
        path=data_root,
        batch_size=1,
        patch_shape=(512, 512),
        choice="h&e",
        download=True,
    )
    lynsec_loader = get_lynsec_loader(**loader_settings)

    # The labels are treated as instance labels by the checker.
    check_loader(lynsec_loader, 8, instance_labels=True)


if __name__ == "__main__":
    check_lynsec()
1 change: 1 addition & 0 deletions torch_em/data/datasets/histopathology/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .bcss import get_bcss_loader, get_bcss_dataset
from .cryonuseg import get_cryonuseg_loader, get_cryonuseg_dataset
from .lizard import get_lizard_loader, get_lizard_dataset
from .lynsec import get_lynsec_loader, get_lynsec_dataset
from .monuseg import get_monuseg_loader, get_monuseg_dataset
from .monusac import get_monusac_loader, get_monusac_dataset
from .pannuke import get_pannuke_loader, get_pannuke_dataset
163 changes: 163 additions & 0 deletions torch_em/data/datasets/histopathology/lynsec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""The LyNSeC dataset contains annotations for nucleus segmentation
in IHC and H&E stained lymphoma tissue images.

The dataset is located at https://doi.org/10.5281/zenodo.8065174.
This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2024.107978.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
from typing import Union, Tuple, List, Optional, Literal

import numpy as np
import imageio.v3 as imageio

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util


# Download link for the zipped LyNSeC data hosted on Zenodo.
URL = "https://zenodo.org/records/8065174/files/lynsec.zip"
# Checksum passed to `util.download_source` together with the URL
# (64 hex characters, so presumably a SHA-256 digest of the archive - confirm against `util`).
CHECKSUM = "14b9b5a9c39cb41afc7f31de5a995cefff0947c215e14ab9c7a463f32fbbf4b6"


def _preprocess_dataset(data_dir):
    """Convert the extracted LyNSeC `.npy` files into image and label tif files.

    Every folder matching `lynsec*` inside `data_dir` is processed. Files from the folder
    named `lynsec 1` are written to `<data_dir>/ihc`, files from all other folders to
    `<data_dir>/h&e`. For each `.npy` file the first three channels of the last axis are
    stored as the image and channel 3 as the labels, under `images/` and `labels/`.

    Args:
        data_dir: Folder that contains the extracted `lynsec*` folders and that will
            receive the `ihc` and `h&e` output folders.
    """
    # A set of H&E images has mismatching labels and is excluded from the export.
    # The names are built once here instead of once per processed file.
    mismatching_names = {f"{i}_l2" for i in range(35)}

    data_dirs = natsorted(glob(os.path.join(data_dir, "lynsec*")))
    for _dir in data_dirs:
        target_dir = "ihc" if os.path.basename(_dir) == "lynsec 1" else "h&e"

        image_dir = os.path.join(data_dir, target_dir, "images")
        label_dir = os.path.join(data_dir, target_dir, "labels")
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(label_dir, exist_ok=True)

        paths = natsorted(glob(os.path.join(_dir, "*.npy")))
        for fpath in tqdm(paths, desc="Preprocessing inputs"):
            fname = Path(fpath).stem

            # Decide about skipping before reading the file, so that excluded
            # arrays are never loaded from disk just to be thrown away.
            if target_dir == "h&e" and fname in mismatching_names:
                continue

            darray = np.load(fpath)
            raw = darray[..., :3]  # image channels
            labels = darray[..., 3]  # label channel

            imageio.imwrite(os.path.join(image_dir, f"{fname}.tif"), raw, compression="zlib")
            imageio.imwrite(os.path.join(label_dir, f"{fname}.tif"), labels, compression="zlib")


def get_lynsec_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the LyNSeC dataset for nucleus segmentation.

    If `<path>/data` already exists it is returned right away. Otherwise the archive is
    fetched to `<path>/lynsec.zip`, extracted into `<path>/data` and converted to tif files.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder with the extracted and preprocessed data.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    # Only the parent folder is required for storing the archive. The data folder itself
    # is created after the download went through: creating it up front would leave an
    # empty folder behind whenever `download_source` fails, and the early return above
    # would then report that empty folder as valid data on every later call.
    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "lynsec.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    os.makedirs(data_dir, exist_ok=True)
    util.unzip(zip_path=zip_path, dst=data_dir)

    _preprocess_dataset(data_dir)

    return data_dir


def get_lynsec_paths(
    path: Union[os.PathLike, str], choice: Optional[Literal['ihc', 'h&e']] = None, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the LyNSeC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        choice: The subset to use, either 'ihc' or 'h&e'. With `None` the files of
            all subset folders are collected.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.
    """
    root = get_lynsec_data(path, download)

    # No explicit choice: use a wildcard so the glob matches every subset folder.
    subset = "*" if choice is None else choice

    def _collect(kind):
        # Naturally sorted tif files of one kind ("images" or "labels").
        return natsorted(glob(os.path.join(root, subset, kind, "*.tif")))

    return _collect("images"), _collect("labels")


def get_lynsec_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    choice: Optional[Literal['ihc', 'h&e']] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the LyNSeC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        choice: The subset to use, either 'ihc' or 'h&e'. `None` selects all subsets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_lynsec_paths(path, choice, download)

    # Images and labels are individual tif files, so no internal dataset keys are passed.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset


def get_lynsec_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    choice: Optional[Literal['ihc', 'h&e']] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the LyNSeC dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        choice: The subset to use, either 'ihc' or 'h&e'. `None` selects all subsets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the remaining ones,
    # which are forwarded to the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    lynsec_dataset = get_lynsec_dataset(
        path=path, patch_shape=patch_shape, choice=choice, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(lynsec_dataset, batch_size, **dataloader_kwargs)