zarr-developers · TomNicholas · May 1, 2024 · Mar 22, 2024 · Mar 22, 2024 · Mar 22, 2024
diff --git a/docs/usage.md b/docs/usage.md
@@ -304,6 +304,25 @@ combined_ds = xr.open_dataset(mapper, engine="kerchunk")
 
 ### Writing as Zarr
 
-TODO: Write out references as a Zarr v3 store following the [Chunk Manifest ZEP](https://github.com/zarr-developers/zarr-specs/issues/287), see [PR #45](https://github.com/TomNicholas/VirtualiZarr/pull/45)
+Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr <virtualizarr.xarray.VirtualiZarrDatasetAccessor.to_zarr>` accessor method.
 
-TODO: Explanation of how this requires changes in zarr upstream to be able to read it
+```python
+combined_vds.virtualize.to_zarr('combined.zarr')
+```
+
+The result is a zarr v3 store on disk which contains the chunk manifest information written out as `manifest.json` files, so the store looks like this:
+
+```
+combined.zarr/zarr.json  <- group metadata
+combined.zarr/air/zarr.json  <- array metadata
+combined.zarr/air/manifest.json <- array manifest
+...
+```
+
+The advantage of this format is that any zarr v3 reader that understands the chunk manifest ZEP could read from this store, no matter what language it is written in (e.g. via `zarr-python`, `zarr-js`, or Rust). This reading would also not require `fsspec`.
+
+```{note}
+There are not yet any zarr v3 readers which understand the chunk manifest ZEP, so for now this feature cannot be used for data processing.
+
+This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`.
+```
diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
@@ -1,4 +1,5 @@
 import itertools
+import json
 import re
 from typing import Any, Iterable, Iterator, List, Mapping, Tuple, Union, cast
 
@@ -110,14 +111,19 @@ def dict(self) -> dict[str, dict[str, Union[str, int]]]:
         """Converts the entire manifest to a nested dictionary."""
         return {k: dict(entry) for k, entry in self.entries.items()}
 
-    @staticmethod
-    def from_zarr_json(filepath: str) -> "ChunkManifest":
+    @classmethod
+    def from_zarr_json(cls, filepath: str) -> "ChunkManifest":
         """Create a ChunkManifest from a Zarr manifest.json file."""
-        raise NotImplementedError()
+        with open(filepath, "r") as manifest_file:
+            raw_manifest = json.load(manifest_file)
+
+        # Every JSON value carries the keyword fields of a single ChunkEntry.
+        return cls(entries={cast(ChunkKey, key): ChunkEntry(**fields) for key, fields in raw_manifest.items()})
 
     def to_zarr_json(self, filepath: str) -> None:
         """Write a ChunkManifest to a Zarr manifest.json file."""
-        raise NotImplementedError()
+        with open(filepath, "w") as json_file:
+            json.dump(self.dict(), json_file, indent=4, separators=(",", ": "))  # bare "," avoids trailing spaces
 
     @classmethod
     def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":

diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py
@@ -0,0 +1,27 @@
+import xarray as xr
+import numpy as np
+import xarray.testing as xrt
+from virtualizarr import open_virtual_dataset, ManifestArray
+from virtualizarr.manifests.manifest import ChunkEntry
+
+
+def test_zarr_v3_roundtrip(tmpdir):
+    """Write a single-variable virtual dataset to a Zarr v3 store and read it back unchanged."""
+    store = tmpdir / "store.zarr"
+    manifest = {"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)}
+    zarray = dict(
+        shape=(2, 3),
+        dtype=np.dtype("<i8"),
+        chunks=(2, 3),
+        compressor=None,
+        filters=None,
+        fill_value=None,
+        order="C",
+        zarr_format=3,
+    )
+    marr = ManifestArray(chunkmanifest=manifest, zarray=zarray)
+    original = xr.Dataset({"a": (["x", "y"], marr)})
+
+    original.virtualize.to_zarr(store)
+    roundtrip = open_virtual_dataset(store, filetype="zarr_v3", indexes={})
+    xrt.assert_identical(roundtrip, original)
diff --git a/virtualizarr/vendor/__init__.py b/virtualizarr/vendor/__init__.py
diff --git a/virtualizarr/vendor/zarr/LICENSE.txt b/virtualizarr/vendor/zarr/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015-2024 Zarr Developers <https://github.com/zarr-developers>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/virtualizarr/vendor/zarr/__init__.py b/virtualizarr/vendor/zarr/__init__.py
diff --git a/virtualizarr/vendor/zarr/utils.py b/virtualizarr/vendor/zarr/utils.py
@@ -0,0 +1,22 @@
+import json
+import numbers
+
+from typing import Any
+
+
+class NumberEncoder(json.JSONEncoder):
+    """JSON encoder that also accepts number-like objects the stock encoder rejects."""
+
+    def default(self, o):
+        # Only called for objects json cannot serialize natively (the upstream note cites numpy).
+        if isinstance(o, numbers.Real):
+            # Integral is a subclass of Real, so pick int vs float here.
+            return int(o) if isinstance(o, numbers.Integral) else float(o)
+        return super().default(o)
+
+
+def json_dumps(o: Any) -> bytes:
+    """Serialize ``o`` to indented, key-sorted, ASCII-only JSON and return it as bytes."""
+    dump_kwargs = dict(indent=4, sort_keys=True, ensure_ascii=True, separators=(",", ": "))
+    text = json.dumps(o, cls=NumberEncoder, **dump_kwargs)
+    return text.encode("ascii")
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
@@ -1,4 +1,5 @@
 from typing import List, Literal, Mapping, Optional, Union, overload
+from pathlib import Path
 
 import ujson  # type: ignore
 import xarray as xr
@@ -9,7 +10,7 @@
 import virtualizarr.kerchunk as kerchunk
 from virtualizarr.kerchunk import KerchunkStoreRefs
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-
+from virtualizarr.zarr import dataset_to_zarr, attrs_from_zarr_group_json, metadata_from_zarr_json
 
 class ManifestBackendArray(ManifestArray, BackendArray):
     """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc."""
@@ -37,7 +38,7 @@
         File path to open as a set of virtualized zarr arrays.
     filetype : str, default None
         Type of file to be opened. Used to determine which kerchunk file format backend to use.
-        Can be one of {'netCDF3', 'netCDF4'}.
+        Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}.
         If not provided will attempt to automatically infer the correct filetype from the the filepath's extension.
     drop_variables: list[str], default is None
         Variables in the file to drop before returning.
@@ -50,37 +51,88 @@
         Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
     """
 
-    # this is the only place we actually always need to use kerchunk directly
-    vds_refs = kerchunk.read_kerchunk_references_from_file(
-        filepath=filepath,
-        filetype=filetype,
-    )
+    if drop_variables is None:
+        drop_variables = []
+
+    if virtual_array_class is not ManifestArray:
+        raise NotImplementedError()
+
+    if filetype == "zarr_v3":
+        # TODO is there a neat way of auto-detecting this?
+        return open_virtual_dataset_from_v3_store(storepath=filepath, drop_variables=drop_variables, indexes=indexes)
+    else:
+        # this is the only place we actually always need to use kerchunk directly
+        vds_refs = kerchunk.read_kerchunk_references_from_file(
+            filepath=filepath,
+            filetype=filetype,
+        )
+
+        if indexes is None:
+            # add default indexes by reading data from file
+            # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
+            # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
+            ds = xr.open_dataset(filepath)
+            indexes = ds.xindexes
+            ds.close()
+
+        vds = dataset_from_kerchunk_refs(
+            vds_refs,
+            drop_variables=drop_variables,
+            virtual_array_class=virtual_array_class,
+            indexes=indexes,
+        )
+
+        # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+
+        return vds
+
+
+def open_virtual_dataset_from_v3_store(
+    storepath: str,
+    drop_variables: List[str],
+    indexes: Optional[Mapping[str, Index]],
+) -> xr.Dataset:
+    """
+    Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays.
+
+    Each immediate subdirectory of ``storepath`` is read as one array (from its
+    ``zarr.json`` and ``manifest.json``), unless its name is in ``drop_variables``.
+    """
+    _storepath = Path(storepath)
+    ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json")
+    # TODO recursive glob to create a datatree
+    virtual_vars = {}
+    for array_dir in _storepath.glob("*/"):
+        # Skip dropped variables (and keep scanning), and skip plain files such as the group's
+        # zarr.json, which glob("*/") also returns on Python < 3.11.
+        if array_dir.name in drop_variables or not array_dir.is_dir():
+            continue
+
+        zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json")
+        manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json"))
+        marr = ManifestArray(chunkmanifest=manifest, zarray=zarray)
+        virtual_vars[array_dir.name] = xr.Variable(data=marr, dims=dim_names, attrs=attrs)
 
     if indexes is None:
-        # add default indexes by reading data from file
-        # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
-        # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
-        ds = xr.open_dataset(filepath)
-        indexes = ds.xindexes
-        ds.close()
-
-    vds = dataset_from_kerchunk_refs(
-        vds_refs,
-        drop_variables=drop_variables,
-        virtual_array_class=virtual_array_class,
-        indexes=indexes,
-    )
+        raise NotImplementedError("Creating default indexes from a Zarr v3 store is not implemented; pass `indexes`.")
 
-    # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+    data_vars, coords = separate_coords(virtual_vars, indexes)
+
+    ds = xr.Dataset(
+        data_vars,
+        coords=coords,
+        # indexes={},  # TODO should be added in a later version of xarray
+        attrs=ds_attrs,
+    )
 
-    return vds
+    return ds
 
 
 def dataset_from_kerchunk_refs(
     refs: KerchunkStoreRefs,
-    drop_variables: Optional[List[str]] = None,
-    virtual_array_class=ManifestArray,
-    indexes={},
+    drop_variables: List[str] = [],
+    virtual_array_class: type = ManifestArray,
+    indexes: Optional[Mapping[str, Index]] = None,
 ) -> xr.Dataset:
     """
     Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays.
@@ -180,13 +232,16 @@
         """
         Serialize all virtualized arrays in this xarray dataset as a Zarr store.
 
+        Currently requires all variables to be backed by ManifestArray objects.
+
+        Not very useful until some implementation of a Zarr reader can actually read these manifest.json files.
+        See https://github.com/zarr-developers/zarr-specs/issues/287
+
         Parameters
         ----------
         storepath : str
         """
-        raise NotImplementedError(
-            "No point in writing out these virtual arrays to Zarr until at least one Zarr reader can actually read them."
-        )
+        dataset_to_zarr(self.ds, storepath)
 
     @overload
     def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs: