Run tests using Zarr Python v3 #1277

Merged (1 commit) on Nov 5, 2024
25 changes: 25 additions & 0 deletions .github/workflows/build.yml
@@ -41,3 +41,28 @@ jobs:
        uses: codecov/codecov-action@v3
        with:
          token: ${{ secrets.CODECOV_TOKEN }}

  test-zarr-version:
    name: Test Zarr Python v3
    # Scheduled runs only on the origin org
    if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
    runs-on: ubuntu-latest
    strategy:
      matrix:
        zarr: ["==3.0.0b1"]
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt -r requirements-dev.txt
      - name: Install zarr${{ matrix.zarr }}
        run: |
          python -m pip install --pre 'zarr${{ matrix.zarr }}'
          python -m pip uninstall -y bio2zarr # TODO: remove when bio2zarr supports Zarr Python 3
      - name: Run tests
        run: |
          pytest
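The new job installs a Zarr Python 3 pre-release from the matrix entry above; the test suite then tells the two major versions apart at runtime. A minimal sketch of that check, using the same packaging-based comparison the test modules below rely on:

```python
# Minimal sketch: detect which Zarr Python major version the job installed.
# This mirrors the Version(...) checks used in the test modules below.
import zarr
from packaging.version import Version

if Version(zarr.__version__).major >= 3:
    print(f"Running against the Zarr Python 3 pre-release ({zarr.__version__})")
else:
    print(f"Running against Zarr Python 2 ({zarr.__version__})")
```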
7 changes: 4 additions & 3 deletions sgkit/io/bgen/bgen_reader.py
@@ -18,6 +18,7 @@
import dask
import dask.array as da
import dask.dataframe as dd
import numcodecs
import numpy as np
import pandas as pd
import xarray as xr
@@ -348,7 +349,7 @@ def encode_variables(
    ds: Dataset,
    chunk_length: int,
    chunk_width: int,
    compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
    compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
    probability_dtype: Optional[Any] = "uint8",
) -> Dict[Hashable, Dict[str, Any]]:
    encoding = {}
@@ -424,7 +425,7 @@ def rechunk_bgen(
    *,
    chunk_length: int = 10_000,
    chunk_width: int = 1_000,
    compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
    compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
    probability_dtype: Optional[DType] = "uint8",
    max_mem: str = "4GB",
    pack: bool = True,
@@ -538,7 +539,7 @@ def bgen_to_zarr(
    chunk_length: int = 10_000,
    chunk_width: int = 1_000,
    temp_chunk_length: int = 100,
    compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
    compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
    probability_dtype: Optional[DType] = "uint8",
    max_mem: str = "4GB",
    pack: bool = True,
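The default compressor in the signatures above is now constructed from numcodecs directly, since Zarr Python 3 no longer re-exports the numcodecs Blosc codec as `zarr.Blosc`. A short sketch of the equivalent object (the constant `shuffle=2` corresponds to Blosc bit-shuffle):

```python
# Equivalent of the new default compressor above; Blosc.BITSHUFFLE == 2.
import numcodecs

compressor = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE)
```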
20 changes: 6 additions & 14 deletions sgkit/io/dataset.py
@@ -1,19 +1,19 @@
from pathlib import Path
from typing import Any, Dict, MutableMapping, Optional, Union

import fsspec
import numcodecs
import xarray as xr
from xarray import Dataset

from sgkit.typing import PathType
from sgkit.utils import has_keyword


def save_dataset(
    ds: Dataset,
    store: Union[PathType, MutableMapping[str, bytes]],
    storage_options: Optional[Dict[str, str]] = None,
    auto_rechunk: Optional[bool] = None,
    zarr_format: int = 2,
    **kwargs: Any,
Comment (Collaborator, Author): Note that we're still using version 2 of the Zarr file format.
) -> None:
    """Save a dataset to Zarr storage.
@@ -35,11 +35,6 @@ def save_dataset(
    kwargs
        Additional arguments to pass to :meth:`xarray.Dataset.to_zarr`.
    """
    if isinstance(store, str):
        storage_options = storage_options or {}
        store = fsspec.get_mapper(store, **storage_options)
    elif isinstance(store, Path):
        store = str(store)
    if auto_rechunk is None:
        auto_rechunk = False
    for v in ds:
@@ -71,7 +66,9 @@ def save_dataset(

    # Catch unequal chunking errors to provide a more helpful error message
    try:
        ds.to_zarr(store, **kwargs)
        if has_keyword(ds.to_zarr, "zarr_format"):  # from xarray v2024.10.0
            kwargs["zarr_format"] = zarr_format
        ds.to_zarr(store, storage_options=storage_options, **kwargs)
    except ValueError as e:
        if "Zarr requires uniform chunk sizes" in str(
            e
@@ -109,12 +106,7 @@ def load_dataset(
    Dataset
        The dataset loaded from the Zarr store or file system.
    """
    if isinstance(store, str):
        storage_options = storage_options or {}
        store = fsspec.get_mapper(store, **storage_options)
    elif isinstance(store, Path):
        store = str(store)
    ds: Dataset = xr.open_zarr(store, concat_characters=False, **kwargs)  # type: ignore[no-untyped-call]
    ds: Dataset = xr.open_zarr(store, storage_options=storage_options, concat_characters=False, **kwargs)  # type: ignore[no-untyped-call]
    for v in ds:
        # Workaround for https://github.com/pydata/xarray/issues/4386
        if v.endswith("_mask"):  # type: ignore
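A hedged usage sketch of the updated I/O helpers (the path and the toy dataset are placeholders): the store argument is now handed to xarray directly, `storage_options` is forwarded instead of being resolved through `fsspec.get_mapper`, and Zarr format 2 remains the default on write.

```python
# Sketch only: a tiny xarray Dataset round-tripped through the updated helpers.
import numpy as np
import xarray as xr

from sgkit import load_dataset, save_dataset

ds = xr.Dataset(
    {"call_genotype": (("variants", "samples", "ploidy"), np.zeros((10, 5, 2), dtype="int8"))}
)
save_dataset(ds, "genotypes.zarr", zarr_format=2)  # explicit, matches the new default
ds_roundtrip = load_dataset("genotypes.zarr")
```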
6 changes: 6 additions & 0 deletions sgkit/tests/io/bgen/test_bgen_reader.py
@@ -5,6 +5,12 @@
import numpy.testing as npt
import pytest
import xarray as xr
import zarr
from packaging.version import Version

pytestmark = pytest.mark.skipif(
    Version(zarr.__version__).major >= 3, reason="Rechunking fails for Zarr Python 3"
)

from sgkit.io.bgen.bgen_reader import (
    GT_DATA_VARS,
10 changes: 9 additions & 1 deletion sgkit/tests/io/test_dataset.py
@@ -2,6 +2,8 @@

import pytest
import xarray as xr
import zarr
from packaging.version import Version
from xarray import Dataset

from sgkit import load_dataset, save_dataset
@@ -54,7 +56,10 @@ def test_save_unequal_chunks_error():
        n_variant=10, n_sample=10, n_ploidy=10, n_allele=10, n_contig=10
    )
    # Normal zarr errors shouldn't be caught
    with pytest.raises(ValueError, match="path '' contains an array"):
    with pytest.raises(
        (FileExistsError, ValueError),
        match="(path '' contains an array|Store already exists)",
    ):
        save_dataset(ds, {".zarray": ""})

    # Make the dataset have unequal chunk sizes across all dimensions
@@ -74,6 +79,9 @@ def test_save_unequal_chunks_error():
        save_dataset(ds, {})


@pytest.mark.skipif(
    Version(zarr.__version__).major >= 3, reason="Fails for Zarr Python 3"
)
def test_save_auto_rechunk():
    # Make all dimensions the same size for ease of testing
    ds = simulate_genotype_call_dataset(
12 changes: 7 additions & 5 deletions sgkit/tests/test_association.py
@@ -6,10 +6,14 @@
import pandas as pd
import pytest
import xarray as xr
import zarr
from pandas import DataFrame
from xarray import Dataset

try:
    from zarr.storage import ZipStore  # v3
except ImportError:  # pragma: no cover
    from zarr import ZipStore

import sgkit.distarray as da
from sgkit.stats.association import (
    gwas_linear_regression,
@@ -313,12 +317,10 @@ def test_regenie_loco_regression(ndarray_type: str, covariate: bool) -> None:

    for ds_name in datasets:
        # Load simulated data
        genotypes_store = zarr.ZipStore(
        genotypes_store = ZipStore(
            str(ds_dir / ds_name / "genotypes.zarr.zip"), mode="r"
        )
        glow_store = zarr.ZipStore(
            str(ds_dir / ds_name / glow_offsets_filename), mode="r"
        )
        glow_store = ZipStore(str(ds_dir / ds_name / glow_offsets_filename), mode="r")

        ds = xr.open_zarr(genotypes_store, consolidated=False)
        glow_loco_predictions = xr.open_zarr(glow_store, consolidated=False)
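The same compatibility import appears in test_regenie.py below; a condensed sketch of the pattern, with a placeholder path:

```python
# ZipStore import that works on both major versions, as in the tests above.
try:
    from zarr.storage import ZipStore  # home of ZipStore in Zarr Python 3
except ImportError:  # pragma: no cover
    from zarr import ZipStore  # top-level location used with Zarr Python 2

import xarray as xr

with ZipStore("genotypes.zarr.zip", mode="r") as store:  # placeholder path
    ds = xr.open_zarr(store, consolidated=False)
```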
8 changes: 6 additions & 2 deletions sgkit/tests/test_regenie.py
@@ -9,7 +9,6 @@
import pytest
import xarray as xr
import yaml
import zarr
from dask.array import Array
from hypothesis import given, settings
from hypothesis import strategies as st
@@ -18,6 +17,11 @@
from pandas import DataFrame
from xarray import Dataset

try:
    from zarr.storage import ZipStore  # v3
except ImportError:  # pragma: no cover
    from zarr import ZipStore

from sgkit.stats.association import LinearRegressionResult, linear_regression
from sgkit.stats.regenie import (
    index_array_blocks,
@@ -258,7 +262,7 @@ def check_simulation_result(
    result_dir = datadir / "result" / run["name"]

    # Load simulated data
    with zarr.ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
    with ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
        ds = xr.open_zarr(store, consolidated=False)
    df_covariate = load_covariates(dataset_dir)
    df_trait = load_traits(dataset_dir)
8 changes: 8 additions & 0 deletions sgkit/utils.py
@@ -1,3 +1,4 @@
import inspect
import warnings
from itertools import product
from typing import Any, Callable, Hashable, List, Mapping, Optional, Set, Tuple, Union
@@ -425,3 +426,10 @@ def smallest_numpy_int_dtype(value: int) -> Optional[DType]:
        if np.iinfo(dtype).min <= value <= np.iinfo(dtype).max:
            return dtype
    raise OverflowError(f"Value {value} cannot be stored in np.int64")


def has_keyword(func, keyword):
    try:
        return keyword in inspect.signature(func).parameters
    except Exception:  # pragma: no cover
        return False
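The new helper lets save_dataset pass `zarr_format` only when the installed xarray accepts it; a small illustration of that pattern:

```python
# Illustration: probe a callable's signature before passing a newer keyword.
import xarray as xr

from sgkit.utils import has_keyword

kwargs = {}
if has_keyword(xr.Dataset.to_zarr, "zarr_format"):  # keyword from xarray v2024.10.0
    kwargs["zarr_format"] = 2
```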