Run tests using Zarr Python v3 #1277

Merged (1 commit) on Nov 5, 2024
25 changes: 25 additions & 0 deletions .github/workflows/build.yml
@@ -41,3 +41,28 @@ jobs:
        uses: codecov/codecov-action@v3
        with:
          token: ${{ secrets.CODECOV_TOKEN }}

  test-zarr-version:
    name: Test Zarr Python v3
    # Scheduled runs only on the origin org
    if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
    runs-on: ubuntu-latest
    strategy:
      matrix:
        zarr: ["==3.0.0b1"]
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt -r requirements-dev.txt
      - name: Install zarr${{ matrix.zarr }}
        run: |
          python -m pip install --pre 'zarr${{ matrix.zarr }}'
          python -m pip uninstall -y bio2zarr # TODO: remove when bio2zarr supports Zarr Python 3
      - name: Run tests
        run: |
          pytest
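The new job installs a Zarr Python 3 pre-release from the matrix entry above; the test suite then tells the two major versions apart at runtime. A minimal sketch of that check, using the same packaging-based comparison the test modules below rely on:

```python
# Minimal sketch: detect which Zarr Python major version the job installed.
# This mirrors the Version(...) checks used in the test modules below.
import zarr
from packaging.version import Version

if Version(zarr.__version__).major >= 3:
    print(f"Running against the Zarr Python 3 pre-release ({zarr.__version__})")
else:
    print(f"Running against Zarr Python 2 ({zarr.__version__})")
```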
7 changes: 4 additions & 3 deletions sgkit/io/bgen/bgen_reader.py
@@ -18,6 +18,7 @@
import dask
import dask.array as da
import dask.dataframe as dd
import numcodecs
import numpy as np
import pandas as pd
import xarray as xr
@@ -348,7 +349,7 @@ def encode_variables(
    ds: Dataset,
    chunk_length: int,
    chunk_width: int,
    compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
    compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
    probability_dtype: Optional[Any] = "uint8",
) -> Dict[Hashable, Dict[str, Any]]:
    encoding = {}
@@ -424,7 +425,7 @@ def rechunk_bgen(
    *,
    chunk_length: int = 10_000,
    chunk_width: int = 1_000,
    compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
    compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
    probability_dtype: Optional[DType] = "uint8",
    max_mem: str = "4GB",
    pack: bool = True,
@@ -538,7 +539,7 @@ def bgen_to_zarr(
    chunk_length: int = 10_000,
    chunk_width: int = 1_000,
    temp_chunk_length: int = 100,
    compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
    compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
    probability_dtype: Optional[DType] = "uint8",
    max_mem: str = "4GB",
    pack: bool = True,
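The default compressor in the signatures above is now constructed from numcodecs directly, since Zarr Python 3 no longer re-exports the numcodecs Blosc codec as `zarr.Blosc`. A short sketch of the equivalent object (the constant `shuffle=2` corresponds to Blosc bit-shuffle):

```python
# Equivalent of the new default compressor above; Blosc.BITSHUFFLE == 2.
import numcodecs

compressor = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE)
```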
20 changes: 6 additions & 14 deletions sgkit/io/dataset.py
@@ -1,19 +1,19 @@
from pathlib import Path
from typing import Any, Dict, MutableMapping, Optional, Union

import fsspec
import numcodecs
import xarray as xr
from xarray import Dataset

from sgkit.typing import PathType
from sgkit.utils import has_keyword


def save_dataset(
    ds: Dataset,
    store: Union[PathType, MutableMapping[str, bytes]],
    storage_options: Optional[Dict[str, str]] = None,
    auto_rechunk: Optional[bool] = None,
    zarr_format: int = 2,
    **kwargs: Any,
Comment (Collaborator, Author): Note that we're still using version 2 of the Zarr file format.
) -> None:
    """Save a dataset to Zarr storage.
@@ -35,11 +35,6 @@ def save_dataset(
    kwargs
        Additional arguments to pass to :meth:`xarray.Dataset.to_zarr`.
    """
    if isinstance(store, str):
        storage_options = storage_options or {}
        store = fsspec.get_mapper(store, **storage_options)
    elif isinstance(store, Path):
        store = str(store)
    if auto_rechunk is None:
        auto_rechunk = False
    for v in ds:
@@ -71,7 +66,9 @@ def save_dataset(

    # Catch unequal chunking errors to provide a more helpful error message
    try:
        ds.to_zarr(store, **kwargs)
        if has_keyword(ds.to_zarr, "zarr_format"):  # from xarray v2024.10.0
            kwargs["zarr_format"] = zarr_format
        ds.to_zarr(store, storage_options=storage_options, **kwargs)
    except ValueError as e:
        if "Zarr requires uniform chunk sizes" in str(
            e
@@ -109,12 +106,7 @@ def load_dataset(
    Dataset
        The dataset loaded from the Zarr store or file system.
    """
    if isinstance(store, str):
        storage_options = storage_options or {}
        store = fsspec.get_mapper(store, **storage_options)
    elif isinstance(store, Path):
        store = str(store)
    ds: Dataset = xr.open_zarr(store, concat_characters=False, **kwargs)  # type: ignore[no-untyped-call]
    ds: Dataset = xr.open_zarr(store, storage_options=storage_options, concat_characters=False, **kwargs)  # type: ignore[no-untyped-call]
    for v in ds:
        # Workaround for https://github.com/pydata/xarray/issues/4386
        if v.endswith("_mask"):  # type: ignore
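A hedged usage sketch of the updated I/O helpers (the path and the toy dataset are placeholders): the store argument is now handed to xarray directly, `storage_options` is forwarded instead of being resolved through `fsspec.get_mapper`, and Zarr format 2 remains the default on write.

```python
# Sketch only: a tiny xarray Dataset round-tripped through the updated helpers.
import numpy as np
import xarray as xr

from sgkit import load_dataset, save_dataset

ds = xr.Dataset(
    {"call_genotype": (("variants", "samples", "ploidy"), np.zeros((10, 5, 2), dtype="int8"))}
)
save_dataset(ds, "genotypes.zarr", zarr_format=2)  # explicit, matches the new default
ds_roundtrip = load_dataset("genotypes.zarr")
```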
6 changes: 6 additions & 0 deletions sgkit/tests/io/bgen/test_bgen_reader.py
@@ -5,6 +5,12 @@
import numpy.testing as npt
import pytest
import xarray as xr
import zarr
from packaging.version import Version

pytestmark = pytest.mark.skipif(
    Version(zarr.__version__).major >= 3, reason="Rechunking fails for Zarr Python 3"
)

from sgkit.io.bgen.bgen_reader import (
    GT_DATA_VARS,
10 changes: 9 additions & 1 deletion sgkit/tests/io/test_dataset.py
@@ -2,6 +2,8 @@

import pytest
import xarray as xr
import zarr
from packaging.version import Version
from xarray import Dataset

from sgkit import load_dataset, save_dataset
@@ -54,7 +56,10 @@ def test_save_unequal_chunks_error():
        n_variant=10, n_sample=10, n_ploidy=10, n_allele=10, n_contig=10
    )
    # Normal zarr errors shouldn't be caught
    with pytest.raises(ValueError, match="path '' contains an array"):
    with pytest.raises(
        (FileExistsError, ValueError),
        match="(path '' contains an array|Store already exists)",
    ):
        save_dataset(ds, {".zarray": ""})

    # Make the dataset have unequal chunk sizes across all dimensions
@@ -74,6 +79,9 @@ def test_save_unequal_chunks_error():
        save_dataset(ds, {})


@pytest.mark.skipif(
    Version(zarr.__version__).major >= 3, reason="Fails for Zarr Python 3"
)
def test_save_auto_rechunk():
    # Make all dimensions the same size for ease of testing
    ds = simulate_genotype_call_dataset(
12 changes: 7 additions & 5 deletions sgkit/tests/test_association.py
@@ -6,10 +6,14 @@
import pandas as pd
import pytest
import xarray as xr
import zarr
from pandas import DataFrame
from xarray import Dataset

try:
    from zarr.storage import ZipStore  # v3
except ImportError:  # pragma: no cover
    from zarr import ZipStore

import sgkit.distarray as da
from sgkit.stats.association import (
    gwas_linear_regression,
@@ -313,12 +317,10 @@ def test_regenie_loco_regression(ndarray_type: str, covariate: bool) -> None:

    for ds_name in datasets:
        # Load simulated data
        genotypes_store = zarr.ZipStore(
        genotypes_store = ZipStore(
            str(ds_dir / ds_name / "genotypes.zarr.zip"), mode="r"
        )
        glow_store = zarr.ZipStore(
            str(ds_dir / ds_name / glow_offsets_filename), mode="r"
        )
        glow_store = ZipStore(str(ds_dir / ds_name / glow_offsets_filename), mode="r")

        ds = xr.open_zarr(genotypes_store, consolidated=False)
        glow_loco_predictions = xr.open_zarr(glow_store, consolidated=False)
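The same compatibility import appears in test_regenie.py below; a condensed sketch of the pattern, with a placeholder path:

```python
# ZipStore import that works on both major versions, as in the tests above.
try:
    from zarr.storage import ZipStore  # home of ZipStore in Zarr Python 3
except ImportError:  # pragma: no cover
    from zarr import ZipStore  # top-level location used with Zarr Python 2

import xarray as xr

with ZipStore("genotypes.zarr.zip", mode="r") as store:  # placeholder path
    ds = xr.open_zarr(store, consolidated=False)
```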
8 changes: 6 additions & 2 deletions sgkit/tests/test_regenie.py
@@ -9,7 +9,6 @@
import pytest
import xarray as xr
import yaml
import zarr
from dask.array import Array
from hypothesis import given, settings
from hypothesis import strategies as st
@@ -18,6 +17,11 @@
from pandas import DataFrame
from xarray import Dataset

try:
    from zarr.storage import ZipStore  # v3
except ImportError:  # pragma: no cover
    from zarr import ZipStore

from sgkit.stats.association import LinearRegressionResult, linear_regression
from sgkit.stats.regenie import (
    index_array_blocks,
@@ -258,7 +262,7 @@ def check_simulation_result(
    result_dir = datadir / "result" / run["name"]

    # Load simulated data
    with zarr.ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
    with ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
        ds = xr.open_zarr(store, consolidated=False)
    df_covariate = load_covariates(dataset_dir)
    df_trait = load_traits(dataset_dir)
8 changes: 8 additions & 0 deletions sgkit/utils.py
@@ -1,3 +1,4 @@
import inspect
import warnings
from itertools import product
from typing import Any, Callable, Hashable, List, Mapping, Optional, Set, Tuple, Union
@@ -425,3 +426,10 @@ def smallest_numpy_int_dtype(value: int) -> Optional[DType]:
        if np.iinfo(dtype).min <= value <= np.iinfo(dtype).max:
            return dtype
    raise OverflowError(f"Value {value} cannot be stored in np.int64")


def has_keyword(func, keyword):
    try:
        return keyword in inspect.signature(func).parameters
    except Exception:  # pragma: no cover
        return False
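The new helper lets save_dataset pass `zarr_format` only when the installed xarray accepts it; a small illustration of that pattern:

```python
# Illustration: probe a callable's signature before passing a newer keyword.
import xarray as xr

from sgkit.utils import has_keyword

kwargs = {}
if has_keyword(xr.Dataset.to_zarr, "zarr_format"):  # keyword from xarray v2024.10.0
    kwargs["zarr_format"] = 2
```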