From c4d4325811e52cb7d5b8ca3160e96a4e27012ebe Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 18 Jun 2024 14:22:58 -0600 Subject: [PATCH] Use 3 numpy arrays for manifest internally (#107) * change entries property to a structured array, add from_dict * fix validation * equals method * re-implemented concatenation through concatenation of the wrapped structured array * fixed manifest.from_kerchunk_dict * fixed kerchunk tests * change private attributes to 3 numpy arrays * add from_arrays method * to and from dict working again * fix dtype comparisons * depend on numpy release candidate * get concatenation and stacking working * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove manifest-level tests of concatenation * generalized create_manifestarray fixture * added tests of broadcasting * made basic broadcasting tests pass by calling np.broadcast_to on underlying numpy arrays * generalize fixture for creating scalar ManifestArrays * improve regression test for expanding scalar ManifestArray * remove now-unneeded scalar broadcasting logic * rewrite __eq__ method on ManifestArray * remove unused import * reinstate the ChunkManifest.__init__ method to accept dictionaries * hopefully fix broadcasting bug * add backwards compatibility for pre-numpy2.0 * depend on numpy>=2.0.0 * rewrote broadcast_to shape logic * remove faulty hypothesis strategies * remove hypothesis from dependencies * ignore remaining mypy errors * release notes * update dependencies in CI test env --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- ci/environment.yml | 4 +- docs/releases.rst | 4 + pyproject.toml | 4 +- virtualizarr/kerchunk.py | 4 +- virtualizarr/manifests/__init__.py | 7 +- virtualizarr/manifests/array.py | 26 +- virtualizarr/manifests/array_api.py | 176 ++++----- virtualizarr/manifests/manifest.py | 334 ++++++++++-------- virtualizarr/tests/test_kerchunk.py | 12 +- .../tests/test_manifests/test_array.py | 49 ++- .../tests/test_manifests/test_manifest.py | 78 +--- virtualizarr/tests/test_zarr.py | 6 +- virtualizarr/zarr.py | 28 +- 13 files changed, 399 insertions(+), 333 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index d8af6bf5..0385ea5a 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,10 +7,10 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.5.0 + - xarray>=2024.6.0 - kerchunk>=0.2.5 - pydantic - - numpy + - numpy>=2.0.0 - ujson - packaging - universal_pathlib diff --git a/docs/releases.rst b/docs/releases.rst index 2255ac10..7228ca66 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -13,6 +13,8 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- Requires numpy 2.0 (for :pull:`107`). + By `Tom Nicholas `_. Deprecations ~~~~~~~~~~~~ @@ -29,6 +31,8 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Refactor `ChunkManifest` class to store chunk references internally using numpy arrays. + (:pull:`107`) By `Tom Nicholas `_. - Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest. (:pull:`144`) By `Tom Nicholas `_. diff --git a/pyproject.toml b/pyproject.toml index 9e1bde4e..075059cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,11 +21,11 @@ classifiers = [ requires-python = ">=3.10" dynamic = ["version"] dependencies = [ - "xarray>=2024.5.0", + "xarray>=2024.06.0", "kerchunk>=0.2.5", "h5netcdf", "pydantic", - "numpy", + "numpy>=2.0.0", "ujson", "packaging", "universal-pathlib", diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 9424ce40..ded76e3e 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -241,8 +241,8 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr marr = var.data arr_refs: dict[str, str | list[str | int]] = { - str(chunk_key): chunk_entry.to_kerchunk() - for chunk_key, chunk_entry in marr.manifest.entries.items() + str(chunk_key): [entry["path"], entry["offset"], entry["length"]] + for chunk_key, entry in marr.manifest.dict().items() } zarray = marr.zarray diff --git a/virtualizarr/manifests/__init__.py b/virtualizarr/manifests/__init__.py index 39f60c59..c317ed6a 100644 --- a/virtualizarr/manifests/__init__.py +++ b/virtualizarr/manifests/__init__.py @@ -2,9 +2,4 @@ # This is just to avoid conflicting with some type of file called manifest that .gitignore recommends ignoring. from .array import ManifestArray # type: ignore # noqa -from .manifest import ( # type: ignore # noqa - ChunkEntry, - ChunkManifest, - concat_manifests, - stack_manifests, -) +from .manifest import ChunkEntry, ChunkManifest # type: ignore # noqa diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index cfc15cca..ed07d8e8 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -50,9 +50,12 @@ def __init__( _chunkmanifest = ChunkManifest(entries=chunkmanifest) else: raise TypeError( - f"chunkmanifest arg must be of type ChunkManifest, but got type {type(chunkmanifest)}" + f"chunkmanifest arg must be of type ChunkManifest or dict, but got type {type(chunkmanifest)}" ) + # TODO check that the zarray shape and chunkmanifest shape are consistent with one another + # TODO also cover the special case of scalar arrays + self._zarray = _zarray self._manifest = _chunkmanifest @@ -140,7 +143,7 @@ def __eq__( # type: ignore[override] Returns a numpy array of booleans. """ - if isinstance(other, (int, float, bool)): + if isinstance(other, (int, float, bool, np.ndarray)): # TODO what should this do when comparing against numpy arrays? return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) elif not isinstance(other, ManifestArray): @@ -164,9 +167,22 @@ def __eq__( # type: ignore[override] UserWarning, ) - # TODO do chunk-wise comparison - # TODO expand it into an element-wise result - return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) + # do chunk-wise comparison + equal_chunk_paths = self.manifest._paths == other.manifest._paths + equal_chunk_offsets = self.manifest._offsets == other.manifest._offsets + equal_chunk_lengths = self.manifest._lengths == other.manifest._lengths + + equal_chunks = ( + equal_chunk_paths & equal_chunk_offsets & equal_chunk_lengths + ) + + if not equal_chunks.all(): + # TODO expand chunk-wise comparison into an element-wise result instead of just returning all False + return np.full( + shape=self.shape, fill_value=False, dtype=np.dtype(bool) + ) + else: + raise RuntimeWarning("Should not be possible to get here") def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": """Cannot change the dtype, but needed because xarray will call this even when it's a no-op.""" diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 8dac514e..0ecdc023 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,11 +1,10 @@ -import itertools -from collections.abc import Callable, Iterable -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Callable, Iterable import numpy as np -from ..zarr import Codec, ZArray -from .manifest import concat_manifests, stack_manifests +from virtualizarr.zarr import Codec, ceildiv + +from .manifest import ChunkManifest if TYPE_CHECKING: from .array import ManifestArray @@ -125,21 +124,28 @@ def concatenate( new_shape = list(first_shape) new_shape[axis] = new_length_along_concat_axis - concatenated_manifest = concat_manifests( - [arr.manifest for arr in arrays], + # do concatenation of entries in manifest + concatenated_paths = np.concatenate( + [arr.manifest._paths for arr in arrays], + axis=axis, + ) + concatenated_offsets = np.concatenate( + [arr.manifest._offsets for arr in arrays], + axis=axis, + ) + concatenated_lengths = np.concatenate( + [arr.manifest._lengths for arr in arrays], axis=axis, ) + concatenated_manifest = ChunkManifest.from_arrays( + paths=concatenated_paths, + offsets=concatenated_offsets, + lengths=concatenated_lengths, + ) - new_zarray = ZArray( - chunks=first_arr.chunks, - compressor=first_arr.zarray.compressor, - dtype=first_arr.dtype, - fill_value=first_arr.zarray.fill_value, - filters=first_arr.zarray.filters, - shape=new_shape, - # TODO presumably these things should be checked for consistency across arrays too? - order=first_arr.zarray.order, - zarr_format=first_arr.zarray.zarr_format, + # chunk shape has not changed, there are just now more chunks along the concatenation axis + new_zarray = first_arr.zarray.replace( + shape=tuple(new_shape), ) return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray) @@ -210,26 +216,33 @@ def stack( new_shape = list(first_shape) new_shape.insert(axis, length_along_new_stacked_axis) - stacked_manifest = stack_manifests( - [arr.manifest for arr in arrays], + # do stacking of entries in manifest + stacked_paths = np.stack( + [arr.manifest._paths for arr in arrays], + axis=axis, + ) + stacked_offsets = np.stack( + [arr.manifest._offsets for arr in arrays], + axis=axis, + ) + stacked_lengths = np.stack( + [arr.manifest._lengths for arr in arrays], axis=axis, ) + stacked_manifest = ChunkManifest.from_arrays( + paths=stacked_paths, + offsets=stacked_offsets, + lengths=stacked_lengths, + ) - # chunk size has changed because a length-1 axis has been inserted + # chunk shape has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks new_chunks = list(old_chunks) new_chunks.insert(axis, 1) - new_zarray = ZArray( - chunks=new_chunks, - compressor=first_arr.zarray.compressor, - dtype=first_arr.dtype, - fill_value=first_arr.zarray.fill_value, - filters=first_arr.zarray.filters, - shape=new_shape, - # TODO presumably these things should be checked for consistency across arrays too? - order=first_arr.zarray.order, - zarr_format=first_arr.zarray.zarr_format, + new_zarray = first_arr.zarray.replace( + chunks=tuple(new_chunks), + shape=tuple(new_shape), ) return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray) @@ -254,74 +267,65 @@ def expand_dims(x: "ManifestArray", /, *, axis: int = 0) -> "ManifestArray": @implements(np.broadcast_to) def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArray": """ - Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries. + Broadcasts a ManifestArray to a specified shape, by either adjusting chunk keys or copying chunk manifest entries. """ - if len(x.shape) > len(shape): - raise ValueError("input operand has more dimensions than allowed") - - # numpy broadcasting algorithm requires us to start by comparing the length of the final axes and work backwards - result = x - for axis, d, d_requested in itertools.zip_longest( - reversed(range(len(shape))), reversed(x.shape), reversed(shape), fillvalue=None - ): - # len(shape) check above ensures this can't be type None - d_requested = cast(int, d_requested) - - if d == d_requested: - pass - elif d is None: - if result.shape == (): - # scalars are a special case because their manifests already have a chunk key with one dimension - # see https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282 - result = _broadcast_scalar(result, new_axis_length=d_requested) - else: - # stack same array upon itself d_requested number of times, which inserts a new axis at axis=0 - result = stack([result] * d_requested, axis=0) - elif d == 1: - # concatenate same array upon itself d_requested number of times along existing axis - result = concatenate([result] * d_requested, axis=axis) - else: - raise ValueError( - f"Array with shape {x.shape} cannot be broadcast to shape {shape}" - ) - - return result - - -def _broadcast_scalar(x: "ManifestArray", new_axis_length: int) -> "ManifestArray": - """ - Add an axis to a scalar ManifestArray, but without adding a new axis to the keys of the chunk manifest. + from .array import ManifestArray - This is not the same as concatenation, because there is no existing axis along which to concatenate. - It's also not the same as stacking, because we don't want to insert a new axis into the chunk keys. + new_shape = shape - Scalars are a special case because their manifests still have a chunk key with one dimension. - See https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282 - """ + # check its actually possible to broadcast to this new shape + mutually_broadcastable_shape = np.broadcast_shapes(x.shape, new_shape) + if mutually_broadcastable_shape != new_shape: + # we're not trying to broadcast both shapes to a third shape + raise ValueError( + f"array of shape {x.shape} cannot be broadcast to shape {new_shape}" + ) - from .array import ManifestArray + # new chunk_shape is old chunk_shape with singleton dimensions pre-pended + # (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements) + old_chunk_shape = x.chunks + new_chunk_shape = _prepend_singleton_dimensions( + old_chunk_shape, ndim=len(new_shape) + ) - new_shape = (new_axis_length,) - new_chunks = (new_axis_length,) + # find new chunk grid shape by dividing new array shape by new chunk shape + new_chunk_grid_shape = tuple( + ceildiv(axis_length, chunk_length) + for axis_length, chunk_length in zip(new_shape, new_chunk_shape) + ) - concatenated_manifest = concat_manifests( - [x.manifest] * new_axis_length, - axis=0, + # do broadcasting of entries in manifest + broadcasted_paths = np.broadcast_to( + x.manifest._paths, + shape=new_chunk_grid_shape, + ) + broadcasted_offsets = np.broadcast_to( + x.manifest._offsets, + shape=new_chunk_grid_shape, + ) + broadcasted_lengths = np.broadcast_to( + x.manifest._lengths, + shape=new_chunk_grid_shape, + ) + broadcasted_manifest = ChunkManifest.from_arrays( + paths=broadcasted_paths, + offsets=broadcasted_offsets, + lengths=broadcasted_lengths, ) - new_zarray = ZArray( - chunks=new_chunks, - compressor=x.zarray.compressor, - dtype=x.dtype, - fill_value=x.zarray.fill_value, - filters=x.zarray.filters, + new_zarray = x.zarray.replace( + chunks=new_chunk_shape, shape=new_shape, - order=x.zarray.order, - zarr_format=x.zarray.zarr_format, ) - return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray) + return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray) + + +def _prepend_singleton_dimensions(shape: tuple[int, ...], ndim: int) -> tuple[int, ...]: + """Prepend as many new length-1 axes to shape as necessary such that the result has ndim number of axes.""" + n_prepended_dims = ndim - len(shape) + return tuple([1] * n_prepended_dims + list(shape)) # TODO broadcast_arrays, squeeze, permute_dims diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index c0a95c67..530ce17c 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,13 +1,12 @@ -import itertools import json import re -from collections.abc import Iterable, Iterator, Mapping -from typing import Any, cast +from collections.abc import Iterable, Iterator +from typing import Any, NewType, Tuple, Union, cast import numpy as np -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict -from ..types import ChunkKey +from virtualizarr.types import ChunkKey _INTEGER = ( r"([1-9]+\d*|0)" # matches 0 or an unsigned integer that does not begin with zero @@ -16,6 +15,9 @@ _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) +ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) + + class ChunkEntry(BaseModel): """ Information for a single chunk in the manifest. @@ -41,13 +43,17 @@ def to_kerchunk(self) -> list[str | int]: """Write out in the format that kerchunk uses for chunk entries.""" return [self.path, self.offset, self.length] + def dict(self) -> dict[str, Union[str, int]]: + return dict(path=self.path, offset=self.offset, length=self.length) + -class ChunkManifest(BaseModel): +class ChunkManifest: """ In-memory representation of a single Zarr chunk manifest. - Stores the manifest as a dictionary under the .chunks attribute, in this form: + Stores the manifest internally as numpy arrays, so the most efficient way to create this object is via the `.from_arrays` constructor classmethod. + The manifest can be converted to or from a dictionary which looks like this { "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, @@ -56,25 +62,131 @@ class ChunkManifest(BaseModel): "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } + using the .__init__() and .dict() methods, so users of this class can think of the manifest as if it were a dict mapping zarr chunk keys to byte ranges. - See the chunk manifest SPEC proposal in https://github.com/zarr-developers/zarr-specs/issues/287 . + (See the chunk manifest SPEC proposal in https://github.com/zarr-developers/zarr-specs/issues/287.) - Validation is done when this object is instatiated, and this class is immutable, - so it's not possible to have a ChunkManifest object that does not represent a complete valid grid of chunks. + Validation is done when this object is instantiated, and this class is immutable, + so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. """ - model_config = ConfigDict(frozen=True) + _paths: np.ndarray[Any, np.dtypes.StringDType] # type: ignore[name-defined] + _offsets: np.ndarray[Any, np.dtype[np.int32]] + _lengths: np.ndarray[Any, np.dtype[np.int32]] + + def __init__(self, entries: dict) -> None: + """ + Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges. + + Parameters + ---------- + entries: dict + Chunk keys and byte range information, as a dictionary of the form + + { + "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + } + """ - entries: Mapping[ChunkKey, ChunkEntry] - # shape_chunk_grid: Tuple[int, ...] # TODO do we need this for anything? + # TODO do some input validation here first? + validate_chunk_keys(entries.keys()) + + # TODO should we actually optionally pass chunk grid shape in, + # in case there are not enough chunks to give correct idea of full shape? + shape = get_chunk_grid_shape(entries.keys()) + + # Initializing to empty implies that entries with path='' are treated as missing chunks + paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) # type: ignore[attr-defined] + offsets = np.empty(shape=shape, dtype=np.dtype("int32")) + lengths = np.empty(shape=shape, dtype=np.dtype("int32")) + + # populate the arrays + for key, entry in entries.items(): + try: + path, offset, length = entry.values() + entry = ChunkEntry(path=path, offset=offset, length=length) + except (ValueError, TypeError) as e: + msg = ( + "Each chunk entry must be of the form dict(path=, offset=, length=), " + f"but got {entry}" + ) + raise ValueError(msg) from e + + split_key = split(key) + paths[split_key] = entry.path + offsets[split_key] = entry.offset + lengths[split_key] = entry.length + + self._paths = paths + self._offsets = offsets + self._lengths = lengths - @field_validator("entries") @classmethod - def validate_chunks(cls, entries: Any) -> Mapping[ChunkKey, ChunkEntry]: - validate_chunk_keys(list(entries.keys())) + def from_arrays( + cls, + paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], # type: ignore[name-defined] + offsets: np.ndarray[Any, np.dtype[np.int32]], + lengths: np.ndarray[Any, np.dtype[np.int32]], + ) -> "ChunkManifest": + """ + Create manifest directly from numpy arrays containing the path and byte range information. + + Useful if you want to avoid the memory overhead of creating an intermediate dictionary first, + as these 3 arrays are what will be used internally to store the references. + + Parameters + ---------- + paths: np.ndarray + offsets: np.ndarray + lengths: np.ndarray + """ + + # check types + if not isinstance(paths, np.ndarray): + raise TypeError(f"paths must be a numpy array, but got type {type(paths)}") + if not isinstance(offsets, np.ndarray): + raise TypeError( + f"offsets must be a numpy array, but got type {type(offsets)}" + ) + if not isinstance(lengths, np.ndarray): + raise TypeError( + f"lengths must be a numpy array, but got type {type(lengths)}" + ) + + # check dtypes + if paths.dtype != np.dtypes.StringDType(): # type: ignore[attr-defined] + raise ValueError( + f"paths array must have a numpy variable-length string dtype, but got dtype {paths.dtype}" + ) + if offsets.dtype != np.dtype("int32"): + raise ValueError( + f"offsets array must have 32-bit integer dtype, but got dtype {offsets.dtype}" + ) + if lengths.dtype != np.dtype("int32"): + raise ValueError( + f"lengths array must have 32-bit integer dtype, but got dtype {lengths.dtype}" + ) + + # check shapes + shape = paths.shape + if offsets.shape != shape: + raise ValueError( + f"Shapes of the arrays must be consistent, but shapes of paths array and offsets array do not match: {paths.shape} vs {offsets.shape}" + ) + if lengths.shape != shape: + raise ValueError( + f"Shapes of the arrays must be consistent, but shapes of paths array and lengths array do not match: {paths.shape} vs {lengths.shape}" + ) + + obj = object.__new__(cls) + obj._paths = paths + obj._offsets = offsets + obj._lengths = lengths - # TODO what if pydantic adjusts anything during validation? - return entries + return obj @property def ndim_chunk_grid(self) -> int: @@ -83,7 +195,7 @@ def ndim_chunk_grid(self) -> int: Not the same as the dimension of an array backed by this chunk manifest. """ - return get_ndim_from_key(list(self.entries.keys())[0]) + return self._paths.ndim @property def shape_chunk_grid(self) -> tuple[int, ...]: @@ -92,54 +204,99 @@ def shape_chunk_grid(self) -> tuple[int, ...]: Not the same as the shape of an array backed by this chunk manifest. """ - return get_chunk_grid_shape(list(self.entries.keys())) + return self._paths.shape def __repr__(self) -> str: return f"ChunkManifest" def __getitem__(self, key: ChunkKey) -> ChunkEntry: - return self.entries[key] + indices = split(key) + path = self._paths[indices] + offset = self._offsets[indices] + length = self._lengths[indices] + return ChunkEntry(path=path, offset=offset, length=length) def __iter__(self) -> Iterator[ChunkKey]: - return iter(self.entries.keys()) + # TODO make this work for numpy arrays + raise NotImplementedError() + # return iter(self._paths.keys()) def __len__(self) -> int: - return len(self.entries) + return self._paths.size + + def dict(self) -> ChunkDict: + """ + Convert the entire manifest to a nested dictionary. + + The returned dict will be of the form + + { + "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + } + + Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary. + """ - def dict(self) -> dict[str, dict[str, str | int]]: - """Converts the entire manifest to a nested dictionary.""" - return {k: dict(entry) for k, entry in self.entries.items()} + coord_vectors = np.mgrid[ + tuple(slice(None, length) for length in self.shape_chunk_grid) + ] + + d = { + join(inds): dict( + path=path.item(), offset=offset.item(), length=length.item() + ) + for *inds, path, offset, length in np.nditer( + [*coord_vectors, self._paths, self._offsets, self._lengths], + flags=("refs_ok",), + ) + if path.item()[0] != "" # don't include entry if path='' (i.e. empty chunk) + } + + return cast( + ChunkDict, + d, + ) + + def __eq__(self, other: Any) -> bool: + """Two manifests are equal if all of their entries are identical.""" + paths_equal = (self._paths == other._paths).all() + offsets_equal = (self._offsets == other._offsets).all() + lengths_equal = (self._lengths == other._lengths).all() + return paths_equal and offsets_equal and lengths_equal @classmethod def from_zarr_json(cls, filepath: str) -> "ChunkManifest": """Create a ChunkManifest from a Zarr manifest.json file.""" - with open(filepath) as manifest_file: - entries_dict = json.load(manifest_file) - entries = { - cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items() - } + with open(filepath, "r") as manifest_file: + entries = json.load(manifest_file) + return cls(entries=entries) def to_zarr_json(self, filepath: str) -> None: - """Write a ChunkManifest to a Zarr manifest.json file.""" + """Write the manifest to a Zarr manifest.json file.""" + entries = self.dict() with open(filepath, "w") as json_file: - json.dump(self.dict(), json_file, indent=4, separators=(", ", ": ")) + json.dump(entries, json_file, indent=4, separators=(", ", ": ")) @classmethod def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": chunkentries = { - k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items() + cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict() + for k, v in kerchunk_chunk_dict.items() } - return ChunkManifest(entries=chunkentries) + return ChunkManifest(entries=cast(ChunkDict, chunkentries)) -def split(key: ChunkKey) -> list[int]: - return list(int(i) for i in key.split(".")) +def split(key: ChunkKey) -> Tuple[int, ...]: + return tuple(int(i) for i in key.split(".")) -def join(inds: Iterable[int]) -> ChunkKey: - return cast(ChunkKey, ".".join(str(i) for i in inds)) +def join(inds: Iterable[Any]) -> ChunkKey: + return cast(ChunkKey, ".".join(str(i) for i in list(inds))) def get_ndim_from_key(key: str) -> int: @@ -163,9 +320,6 @@ def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]): f"Inconsistent number of dimensions between chunk key {key} and {first_key}: {other_ndim} vs {ndim}" ) - # Check that the keys collectively form a complete grid - check_keys_form_grid(chunk_keys) - def get_chunk_grid_shape(chunk_keys: Iterable[ChunkKey]) -> tuple[int, ...]: # find max chunk index along each dimension @@ -174,99 +328,3 @@ def get_chunk_grid_shape(chunk_keys: Iterable[ChunkKey]) -> tuple[int, ...]: max(indices_along_one_dim) + 1 for indices_along_one_dim in zipped_indices ) return chunk_grid_shape - - -def check_keys_form_grid(chunk_keys: Iterable[ChunkKey]): - """Check that the chunk keys collectively form a complete grid""" - - chunk_grid_shape = get_chunk_grid_shape(chunk_keys) - - # create every possible combination - all_possible_combos = itertools.product( - *[range(length) for length in chunk_grid_shape] - ) - all_required_chunk_keys: set[ChunkKey] = { - join(inds) for inds in all_possible_combos - } - - # check that every possible combination is represented once in the list of chunk keys - if set(chunk_keys) != all_required_chunk_keys: - raise ValueError("Chunk keys do not form a complete grid") - - -def concat_manifests(manifests: list["ChunkManifest"], axis: int) -> "ChunkManifest": - """ - Concatenate manifests along an existing dimension. - - This only requires adjusting one index of chunk keys along a single dimension. - - Note axis is not expected to be negative. - """ - if len(manifests) == 1: - return manifests[0] - - chunk_grid_shapes = [manifest.shape_chunk_grid for manifest in manifests] - lengths_along_concat_dim = [shape[axis] for shape in chunk_grid_shapes] - - # Note we do not need to change the keys of the first manifest - chunk_index_offsets = np.cumsum(lengths_along_concat_dim)[:-1] - new_entries = [ - adjust_chunk_keys(manifest.entries, axis, offset) - for manifest, offset in zip(manifests[1:], chunk_index_offsets) - ] - all_entries = [manifests[0].entries] + new_entries - merged_entries = {k: v for d in all_entries for k, v in d.items()} - - # Arguably don't need to re-perform validation checks on a manifest we created out of already-validated manifests - # Could use pydantic's model_construct classmethod to skip these checks - # But we should actually performance test it because it might be pointless, and current implementation is safer - return ChunkManifest(entries=merged_entries) - - -def adjust_chunk_keys( - entries: Mapping[ChunkKey, ChunkEntry], axis: int, offset: int -) -> Mapping[ChunkKey, ChunkEntry]: - """Replace all chunk keys with keys which have been offset along one axis.""" - - def offset_key(key: ChunkKey, axis: int, offset: int) -> ChunkKey: - inds = split(key) - inds[axis] += offset - return join(inds) - - return {offset_key(k, axis, offset): v for k, v in entries.items()} - - -def stack_manifests(manifests: list[ChunkManifest], axis: int) -> "ChunkManifest": - """ - Stack manifests along a new dimension. - - This only requires inserting one index into all chunk keys to add a new dimension. - - Note axis is not expected to be negative. - """ - - # even if there is only one manifest it still needs a new axis inserted - chunk_indexes_along_new_dim = range(len(manifests)) - new_entries = [ - insert_new_axis_into_chunk_keys(manifest.entries, axis, new_index_value) - for manifest, new_index_value in zip(manifests, chunk_indexes_along_new_dim) - ] - merged_entries = {k: v for d in new_entries for k, v in d.items()} - - # Arguably don't need to re-perform validation checks on a manifest we created out of already-validated manifests - # Could use pydantic's model_construct classmethod to skip these checks - # But we should actually performance test it because it might be pointless, and current implementation is safer - return ChunkManifest(entries=merged_entries) - - -def insert_new_axis_into_chunk_keys( - entries: Mapping[ChunkKey, ChunkEntry], axis: int, new_index_value: int -) -> Mapping[ChunkKey, ChunkEntry]: - """Replace all chunk keys with keys which have a new axis inserted, with a given value.""" - - def insert_axis(key: ChunkKey, new_axis: int, index_value: int) -> ChunkKey: - inds = split(key) - inds.insert(new_axis, index_value) - return join(inds) - - return {insert_axis(k, axis, new_index_value): v for k, v in entries.items()} diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index c255a8d8..a623bc02 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -6,7 +6,7 @@ import xarray.testing as xrt from virtualizarr.kerchunk import FileType, _automatically_determine_filetype -from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs @@ -68,8 +68,11 @@ def test_dataset_from_df_refs_with_filters(): class TestAccessor: def test_accessor_to_kerchunk_dict(self): + manifest = ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} + ) arr = ManifestArray( - chunkmanifest={"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)}, + chunkmanifest=manifest, zarray=dict( shape=(2, 3), dtype=np.dtype(" str: def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": # coerce type of fill_value as kerchunk can be inconsistent with this fill_value = decoded_arr_refs_zarray["fill_value"] - if fill_value is None or fill_value == "NaN": + if fill_value is None or fill_value == "NaN" or fill_value == "nan": fill_value = np.nan return ZArray( @@ -102,6 +103,31 @@ def dict(self) -> dict[str, Any]: def to_kerchunk_json(self) -> str: return ujson.dumps(self.dict()) + def replace( + self, + chunks: Optional[tuple[int, ...]] = None, + compressor: Optional[str] = None, + dtype: Optional[np.dtype] = None, + fill_value: Optional[float] = None, # float or int? + filters: Optional[list[dict]] = None, # type: ignore[valid-type] + order: Optional[Literal["C"] | Literal["F"]] = None, + shape: Optional[tuple[int, ...]] = None, + zarr_format: Optional[Literal[2] | Literal[3]] = None, + ) -> "ZArray": + """ + Convenience method to create a new ZArray from an existing one by altering only certain attributes. + """ + return ZArray( + chunks=chunks if chunks is not None else self.chunks, + compressor=compressor if compressor is not None else self.compressor, + dtype=dtype if dtype is not None else self.dtype, + fill_value=fill_value if fill_value is not None else self.fill_value, + filters=filters if filters is not None else self.filters, + shape=shape if shape is not None else self.shape, + order=order if order is not None else self.order, + zarr_format=zarr_format if zarr_format is not None else self.zarr_format, + ) + def encode_dtype(dtype: np.dtype) -> str: # TODO not sure if there is a better way to get the '