Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write manifests to zarr store #45

Merged
merged 18 commits into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,25 @@ combined_ds = xr.open_dataset(mapper, engine="kerchunk")

### Writing as Zarr

TODO: Write out references as a Zarr v3 store following the [Chunk Manifest ZEP](https://github.com/zarr-developers/zarr-specs/issues/287), see [PR #45](https://github.com/TomNicholas/VirtualiZarr/pull/45)
Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr <virtualizarr.xarray.VirtualiZarrDatasetAccessor.to_zarr>` accessor method.

TODO: Explanation of how this requires changes in zarr upstream to be able to read it
```python
combined_vds.virtualize.to_zarr('combined.zarr')
```

The result is a zarr v3 store on disk which contains the chunk manifest information written out as `manifest.json` files, so the store looks like this:

```
combined.zarr/zarr.json <- group metadata
combined.zarr/air/zarr.json <- array metadata
combined.zarr/air/manifest.json <- array manifest
...
```

The advantage of this format is that any zarr v3 reader that understands the chunk manifest ZEP could read from this store, no matter what language it is written in (e.g. via `zarr-python`, `zarr-js`, or Rust). Reading the store this way would also not require `fsspec`.

```{note}
There are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until one exists this feature cannot be used for data processing.

This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`.
```
14 changes: 10 additions & 4 deletions virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import itertools
import json
import re
from typing import Any, Iterable, Iterator, List, Mapping, Tuple, Union, cast

Expand Down Expand Up @@ -110,14 +111,19 @@ def dict(self) -> dict[str, dict[str, Union[str, int]]]:
"""Converts the entire manifest to a nested dictionary."""
return {k: dict(entry) for k, entry in self.entries.items()}

@classmethod
def from_zarr_json(cls, filepath: str) -> "ChunkManifest":
    """
    Create a ChunkManifest from a Zarr manifest.json file.

    Parameters
    ----------
    filepath : str
        Path to a JSON file whose top-level object maps chunk keys to
        objects holding the keyword arguments of a ``ChunkEntry``.

    Returns
    -------
    ChunkManifest
        A manifest built from every entry found in the file.
    """
    # JSON is UTF-8 by specification, so don't depend on the platform's
    # locale encoding when decoding the file.
    with open(filepath, "r", encoding="utf-8") as manifest_file:
        entries_dict = json.load(manifest_file)

    entries = {
        cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items()
    }
    return cls(entries=entries)

def to_zarr_json(self, filepath: str) -> None:
    """
    Write a ChunkManifest to a Zarr manifest.json file.

    The nested dictionary returned by ``self.dict()`` is serialized as
    indented JSON.

    Parameters
    ----------
    filepath : str
        Destination path; an existing file is overwritten.
    """
    # With ``indent`` set, every item separator is immediately followed by a
    # newline, so a ", " separator would leave trailing whitespace on each
    # line. Use a bare "," instead.
    with open(filepath, "w", encoding="utf-8") as json_file:
        json.dump(self.dict(), json_file, indent=4, separators=(",", ": "))

@classmethod
def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":
Expand Down
27 changes: 27 additions & 0 deletions virtualizarr/tests/test_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import xarray as xr
import numpy as np
import xarray.testing as xrt
from virtualizarr import open_virtual_dataset, ManifestArray
from virtualizarr.manifests.manifest import ChunkEntry


def test_zarr_v3_roundtrip(tmpdir):
    """Write a one-variable virtual dataset to a zarr v3 store and read it back unchanged."""
    store_path = tmpdir / "store.zarr"

    # A single manifest entry covering the whole (2, 3) int64 array as one chunk.
    manifest = {"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)}
    zarray = dict(
        shape=(2, 3),
        dtype=np.dtype("<i8"),
        chunks=(2, 3),
        compressor=None,
        filters=None,
        fill_value=None,
        order="C",
        zarr_format=3,
    )
    virtual_array = ManifestArray(chunkmanifest=manifest, zarray=zarray)
    original = xr.Dataset({"a": (["x", "y"], virtual_array)})

    original.virtualize.to_zarr(store_path)
    roundtrip = open_virtual_dataset(store_path, filetype="zarr_v3", indexes={})

    xrt.assert_identical(roundtrip, original)
Empty file added virtualizarr/vendor/__init__.py
Empty file.
21 changes: 21 additions & 0 deletions virtualizarr/vendor/zarr/LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2015-2024 Zarr Developers <https://github.com/zarr-developers>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Empty file.
22 changes: 22 additions & 0 deletions virtualizarr/vendor/zarr/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import json
import numbers

from typing import Any


class NumberEncoder(json.JSONEncoder):
    """JSON encoder that also accepts numeric objects which are not builtin numbers."""

    def default(self, o):
        """
        Convert ``o`` to a builtin ``int`` or ``float`` when it is an abstract number.

        See the ``json.JSONEncoder.default`` docstring for how this hook is used.
        This is necessary to encode numpy dtype.
        Anything that is neither integral nor real is passed to the base class.
        """
        # Integral must be tested before Real: every Integral is also a Real.
        for abstract_type, builtin_type in (
            (numbers.Integral, int),
            (numbers.Real, float),
        ):
            if isinstance(o, abstract_type):
                return builtin_type(o)
        return super().default(o)

Check warning on line 15 in virtualizarr/vendor/zarr/utils.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/zarr/utils.py#L11-L15

Added lines #L11 - L15 were not covered by tests


def json_dumps(o: Any) -> bytes:
    """Serialize ``o`` as consistent, human-readable JSON and return it as ASCII bytes."""
    # Vendored rather than imported from zarr-python: that library's internals
    # are in flux, and keeping a copy here makes it clear exactly which parts
    # of this package need zarr-python at all.
    text = json.dumps(
        o,
        cls=NumberEncoder,
        indent=4,
        sort_keys=True,
        ensure_ascii=True,
        separators=(",", ": "),
    )
    return text.encode("ascii")
111 changes: 83 additions & 28 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Literal, Mapping, Optional, Union, overload
from pathlib import Path

import ujson # type: ignore
import xarray as xr
Expand All @@ -9,7 +10,7 @@
import virtualizarr.kerchunk as kerchunk
from virtualizarr.kerchunk import KerchunkStoreRefs
from virtualizarr.manifests import ChunkManifest, ManifestArray

from virtualizarr.zarr import dataset_to_zarr, attrs_from_zarr_group_json, metadata_from_zarr_json

class ManifestBackendArray(ManifestArray, BackendArray):
"""Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc."""
Expand Down Expand Up @@ -37,7 +38,7 @@
File path to open as a set of virtualized zarr arrays.
filetype : str, default None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
Can be one of {'netCDF3', 'netCDF4'}.
Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}.
If not provided will attempt to automatically infer the correct filetype from the filepath's extension.
drop_variables: list[str], default is None
Variables in the file to drop before returning.
Expand All @@ -50,37 +51,88 @@
Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
"""

# this is the only place we actually always need to use kerchunk directly
vds_refs = kerchunk.read_kerchunk_references_from_file(
filepath=filepath,
filetype=filetype,
)
if drop_variables is None:
drop_variables = []

if virtual_array_class is not ManifestArray:
raise NotImplementedError()

Check warning on line 58 in virtualizarr/xarray.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/xarray.py#L58

Added line #L58 was not covered by tests

if filetype == "zarr_v3":
# TODO is there a neat way of auto-detecting this?
Comment on lines +85 to +86
Copy link
Member Author

@TomNicholas TomNicholas Mar 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit ugly - I want to automatically distinguish between non-zarr, zarr v2 (both to be read using kerchunk) and zarr v3 (to be read using this code). I guess I will just have to search for .zgroup/zarr.json files explicitly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@norlandrhagen do you have any thoughts on a neat way to handle this?

return open_virtual_dataset_from_v3_store(storepath=filepath, drop_variables=drop_variables, indexes=indexes)
else:
# this is the only place we actually always need to use kerchunk directly
vds_refs = kerchunk.read_kerchunk_references_from_file(
filepath=filepath,
filetype=filetype,
)

if indexes is None:
# add default indexes by reading data from file
# TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
# TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
ds = xr.open_dataset(filepath)
indexes = ds.xindexes
ds.close()

vds = dataset_from_kerchunk_refs(
vds_refs,
drop_variables=drop_variables,
virtual_array_class=virtual_array_class,
indexes=indexes,
)

# TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened

return vds


def open_virtual_dataset_from_v3_store(
    storepath: str,
    drop_variables: List[str],
    indexes: Optional[Mapping[str, Index]],
) -> xr.Dataset:
    """
    Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays.

    Parameters
    ----------
    storepath : str
        Path to the root directory of the store. Dataset attributes are read
        from ``zarr.json`` in this directory, and every sub-directory is read
        as one array from its own ``zarr.json`` and ``manifest.json``.
    drop_variables : list[str]
        Names of arrays in the store to leave out of the returned dataset.
    indexes : Mapping[str, Index] or None
        Passed to ``separate_coords`` together with the variables that were
        read. Must currently be given explicitly (e.g. ``{}``).

    Returns
    -------
    xr.Dataset
        Dataset whose variables wrap ``ManifestArray`` objects.

    Raises
    ------
    NotImplementedError
        If ``indexes`` is None.
    """
    _storepath = Path(storepath)

    ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json")

    # TODO recursive glob to create a datatree
    virtual_vars = {}
    for array_dir in _storepath.glob("*/"):
        # Before Python 3.11 a trailing-slash glob pattern also matches plain
        # files (e.g. the group's own zarr.json), so filter explicitly.
        if not array_dir.is_dir():
            continue

        var_name = array_dir.name
        if var_name in drop_variables:
            # Skip only this variable; the remaining arrays must still be read.
            continue

        zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json")
        manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json"))

        marr = ManifestArray(chunkmanifest=manifest, zarray=zarray)
        virtual_vars[var_name] = xr.Variable(data=marr, dims=dim_names, attrs=attrs)

    if indexes is None:
        raise NotImplementedError(
            "Creating default indexes from a zarr v3 store is not supported; "
            "pass `indexes` explicitly, e.g. `indexes={}`."
        )

    data_vars, coords = separate_coords(virtual_vars, indexes)

    ds = xr.Dataset(
        data_vars,
        coords=coords,
        # indexes={},  # TODO should be added in a later version of xarray
        attrs=ds_attrs,
    )

    return ds


def dataset_from_kerchunk_refs(
refs: KerchunkStoreRefs,
drop_variables: Optional[List[str]] = None,
virtual_array_class=ManifestArray,
indexes={},
drop_variables: List[str] = [],
virtual_array_class: type = ManifestArray,
indexes: Optional[Mapping[str, Index]] = None,
) -> xr.Dataset:
"""
Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays.
Expand Down Expand Up @@ -180,13 +232,16 @@
"""
Serialize all virtualized arrays in this xarray dataset as a Zarr store.

Currently requires all variables to be backed by ManifestArray objects.

Not very useful until some implementation of a Zarr reader can actually read these manifest.json files.
See https://github.com/zarr-developers/zarr-specs/issues/287

Parameters
----------
storepath : str
"""
raise NotImplementedError(
"No point in writing out these virtual arrays to Zarr until at least one Zarr reader can actually read them."
)
dataset_to_zarr(self.ds, storepath)

@overload
def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs:
Expand Down
Loading
Loading