Skip to content

Commit

Permalink
Use 3 numpy arrays for manifest internally (#107)
Browse files Browse the repository at this point in the history
* change entries property to a structured array, add from_dict

* fix validation

* equals method

* re-implemented concatenation through concatenation of the wrapped structured array

* fixed manifest.from_kerchunk_dict

* fixed kerchunk tests

* change private attributes to 3 numpy arrays

* add from_arrays method

* to and from dict working again

* fix dtype comparisons

* depend on numpy release candidate

* get concatenation and stacking working

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove manifest-level tests of concatenation

* generalized create_manifestarray fixture

* added tests of broadcasting

* made basic broadcasting tests pass by calling np.broadcast_to on underlying numpy arrays

* generalize fixture for creating scalar ManifestArrays

* improve regression test for expanding scalar ManifestArray

* remove now-unneeded scalar broadcasting logic

* rewrite __eq__ method on ManifestArray

* remove unused import

* reinstate the ChunkManifest.__init__ method to accept dictionaries

* hopefully fix broadcasting bug

* add backwards compatibility for pre-numpy2.0

* depend on numpy>=2.0.0

* rewrote broadcast_to shape logic

* remove faulty hypothesis strategies

* remove hypothesis from dependencies

* ignore remaining mypy errors

* release notes

* update dependencies in CI test env

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
TomNicholas and pre-commit-ci[bot] authored Jun 18, 2024
1 parent 27a6f94 commit c4d4325
Show file tree
Hide file tree
Showing 13 changed files with 399 additions and 333 deletions.
4 changes: 2 additions & 2 deletions ci/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ dependencies:
- h5py
- hdf5
- netcdf4
- xarray>=2024.5.0
- xarray>=2024.6.0
- kerchunk>=0.2.5
- pydantic
- numpy
- numpy>=2.0.0
- ujson
- packaging
- universal_pathlib
Expand Down
4 changes: 4 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ New Features
Breaking changes
~~~~~~~~~~~~~~~~

- Requires numpy 2.0 (for :pull:`107`).
By `Tom Nicholas <https://github.com/TomNicholas>`_.

Deprecations
~~~~~~~~~~~~
Expand All @@ -29,6 +31,8 @@ Documentation
Internal Changes
~~~~~~~~~~~~~~~~

- Refactor `ChunkManifest` class to store chunk references internally using numpy arrays.
(:pull:`107`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest.
(:pull:`144`) By `Tom Nicholas <https://github.com/TomNicholas>`_.

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ classifiers = [
requires-python = ">=3.10"
dynamic = ["version"]
dependencies = [
"xarray>=2024.5.0",
"xarray>=2024.06.0",
"kerchunk>=0.2.5",
"h5netcdf",
"pydantic",
"numpy",
"numpy>=2.0.0",
"ujson",
"packaging",
"universal-pathlib",
Expand Down
4 changes: 2 additions & 2 deletions virtualizarr/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,8 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr
marr = var.data

arr_refs: dict[str, str | list[str | int]] = {
str(chunk_key): chunk_entry.to_kerchunk()
for chunk_key, chunk_entry in marr.manifest.entries.items()
str(chunk_key): [entry["path"], entry["offset"], entry["length"]]
for chunk_key, entry in marr.manifest.dict().items()
}

zarray = marr.zarray
Expand Down
7 changes: 1 addition & 6 deletions virtualizarr/manifests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,4 @@
# This is just to avoid conflicting with some type of file called manifest that .gitignore recommends ignoring.

from .array import ManifestArray # type: ignore # noqa
from .manifest import ( # type: ignore # noqa
ChunkEntry,
ChunkManifest,
concat_manifests,
stack_manifests,
)
from .manifest import ChunkEntry, ChunkManifest # type: ignore # noqa
26 changes: 21 additions & 5 deletions virtualizarr/manifests/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,12 @@ def __init__(
_chunkmanifest = ChunkManifest(entries=chunkmanifest)
else:
raise TypeError(
f"chunkmanifest arg must be of type ChunkManifest, but got type {type(chunkmanifest)}"
f"chunkmanifest arg must be of type ChunkManifest or dict, but got type {type(chunkmanifest)}"
)

# TODO check that the zarray shape and chunkmanifest shape are consistent with one another
# TODO also cover the special case of scalar arrays

self._zarray = _zarray
self._manifest = _chunkmanifest

Expand Down Expand Up @@ -140,7 +143,7 @@ def __eq__( # type: ignore[override]
Returns a numpy array of booleans.
"""
if isinstance(other, (int, float, bool)):
if isinstance(other, (int, float, bool, np.ndarray)):
# TODO what should this do when comparing against numpy arrays?
return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool))
elif not isinstance(other, ManifestArray):
Expand All @@ -164,9 +167,22 @@ def __eq__( # type: ignore[override]
UserWarning,
)

# TODO do chunk-wise comparison
# TODO expand it into an element-wise result
return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool))
# do chunk-wise comparison
equal_chunk_paths = self.manifest._paths == other.manifest._paths
equal_chunk_offsets = self.manifest._offsets == other.manifest._offsets
equal_chunk_lengths = self.manifest._lengths == other.manifest._lengths

equal_chunks = (
equal_chunk_paths & equal_chunk_offsets & equal_chunk_lengths
)

if not equal_chunks.all():
# TODO expand chunk-wise comparison into an element-wise result instead of just returning all False
return np.full(
shape=self.shape, fill_value=False, dtype=np.dtype(bool)
)
else:
raise RuntimeWarning("Should not be possible to get here")

def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray":
"""Cannot change the dtype, but needed because xarray will call this even when it's a no-op."""
Expand Down
176 changes: 90 additions & 86 deletions virtualizarr/manifests/array_api.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import itertools
from collections.abc import Callable, Iterable
from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, Callable, Iterable

import numpy as np

from ..zarr import Codec, ZArray
from .manifest import concat_manifests, stack_manifests
from virtualizarr.zarr import Codec, ceildiv

from .manifest import ChunkManifest

if TYPE_CHECKING:
from .array import ManifestArray
Expand Down Expand Up @@ -125,21 +124,28 @@ def concatenate(
new_shape = list(first_shape)
new_shape[axis] = new_length_along_concat_axis

concatenated_manifest = concat_manifests(
[arr.manifest for arr in arrays],
# do concatenation of entries in manifest
concatenated_paths = np.concatenate(
[arr.manifest._paths for arr in arrays],
axis=axis,
)
concatenated_offsets = np.concatenate(
[arr.manifest._offsets for arr in arrays],
axis=axis,
)
concatenated_lengths = np.concatenate(
[arr.manifest._lengths for arr in arrays],
axis=axis,
)
concatenated_manifest = ChunkManifest.from_arrays(
paths=concatenated_paths,
offsets=concatenated_offsets,
lengths=concatenated_lengths,
)

new_zarray = ZArray(
chunks=first_arr.chunks,
compressor=first_arr.zarray.compressor,
dtype=first_arr.dtype,
fill_value=first_arr.zarray.fill_value,
filters=first_arr.zarray.filters,
shape=new_shape,
# TODO presumably these things should be checked for consistency across arrays too?
order=first_arr.zarray.order,
zarr_format=first_arr.zarray.zarr_format,
# chunk shape has not changed, there are just now more chunks along the concatenation axis
new_zarray = first_arr.zarray.replace(
shape=tuple(new_shape),
)

return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray)
Expand Down Expand Up @@ -210,26 +216,33 @@ def stack(
new_shape = list(first_shape)
new_shape.insert(axis, length_along_new_stacked_axis)

stacked_manifest = stack_manifests(
[arr.manifest for arr in arrays],
# do stacking of entries in manifest
stacked_paths = np.stack(
[arr.manifest._paths for arr in arrays],
axis=axis,
)
stacked_offsets = np.stack(
[arr.manifest._offsets for arr in arrays],
axis=axis,
)
stacked_lengths = np.stack(
[arr.manifest._lengths for arr in arrays],
axis=axis,
)
stacked_manifest = ChunkManifest.from_arrays(
paths=stacked_paths,
offsets=stacked_offsets,
lengths=stacked_lengths,
)

# chunk size has changed because a length-1 axis has been inserted
# chunk shape has changed because a length-1 axis has been inserted
old_chunks = first_arr.chunks
new_chunks = list(old_chunks)
new_chunks.insert(axis, 1)

new_zarray = ZArray(
chunks=new_chunks,
compressor=first_arr.zarray.compressor,
dtype=first_arr.dtype,
fill_value=first_arr.zarray.fill_value,
filters=first_arr.zarray.filters,
shape=new_shape,
# TODO presumably these things should be checked for consistency across arrays too?
order=first_arr.zarray.order,
zarr_format=first_arr.zarray.zarr_format,
new_zarray = first_arr.zarray.replace(
chunks=tuple(new_chunks),
shape=tuple(new_shape),
)

return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray)
Expand All @@ -254,74 +267,65 @@ def expand_dims(x: "ManifestArray", /, *, axis: int = 0) -> "ManifestArray":
@implements(np.broadcast_to)
def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArray":
"""
Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries.
Broadcasts a ManifestArray to a specified shape, by either adjusting chunk keys or copying chunk manifest entries.
"""

if len(x.shape) > len(shape):
raise ValueError("input operand has more dimensions than allowed")

# numpy broadcasting algorithm requires us to start by comparing the length of the final axes and work backwards
result = x
for axis, d, d_requested in itertools.zip_longest(
reversed(range(len(shape))), reversed(x.shape), reversed(shape), fillvalue=None
):
# len(shape) check above ensures this can't be type None
d_requested = cast(int, d_requested)

if d == d_requested:
pass
elif d is None:
if result.shape == ():
# scalars are a special case because their manifests already have a chunk key with one dimension
# see https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282
result = _broadcast_scalar(result, new_axis_length=d_requested)
else:
# stack same array upon itself d_requested number of times, which inserts a new axis at axis=0
result = stack([result] * d_requested, axis=0)
elif d == 1:
# concatenate same array upon itself d_requested number of times along existing axis
result = concatenate([result] * d_requested, axis=axis)
else:
raise ValueError(
f"Array with shape {x.shape} cannot be broadcast to shape {shape}"
)

return result


def _broadcast_scalar(x: "ManifestArray", new_axis_length: int) -> "ManifestArray":
"""
Add an axis to a scalar ManifestArray, but without adding a new axis to the keys of the chunk manifest.
from .array import ManifestArray

This is not the same as concatenation, because there is no existing axis along which to concatenate.
It's also not the same as stacking, because we don't want to insert a new axis into the chunk keys.
new_shape = shape

Scalars are a special case because their manifests still have a chunk key with one dimension.
See https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282
"""
# check its actually possible to broadcast to this new shape
mutually_broadcastable_shape = np.broadcast_shapes(x.shape, new_shape)
if mutually_broadcastable_shape != new_shape:
# we're not trying to broadcast both shapes to a third shape
raise ValueError(
f"array of shape {x.shape} cannot be broadcast to shape {new_shape}"
)

from .array import ManifestArray
# new chunk_shape is old chunk_shape with singleton dimensions pre-pended
# (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements)
old_chunk_shape = x.chunks
new_chunk_shape = _prepend_singleton_dimensions(
old_chunk_shape, ndim=len(new_shape)
)

new_shape = (new_axis_length,)
new_chunks = (new_axis_length,)
# find new chunk grid shape by dividing new array shape by new chunk shape
new_chunk_grid_shape = tuple(
ceildiv(axis_length, chunk_length)
for axis_length, chunk_length in zip(new_shape, new_chunk_shape)
)

concatenated_manifest = concat_manifests(
[x.manifest] * new_axis_length,
axis=0,
# do broadcasting of entries in manifest
broadcasted_paths = np.broadcast_to(
x.manifest._paths,
shape=new_chunk_grid_shape,
)
broadcasted_offsets = np.broadcast_to(
x.manifest._offsets,
shape=new_chunk_grid_shape,
)
broadcasted_lengths = np.broadcast_to(
x.manifest._lengths,
shape=new_chunk_grid_shape,
)
broadcasted_manifest = ChunkManifest.from_arrays(
paths=broadcasted_paths,
offsets=broadcasted_offsets,
lengths=broadcasted_lengths,
)

new_zarray = ZArray(
chunks=new_chunks,
compressor=x.zarray.compressor,
dtype=x.dtype,
fill_value=x.zarray.fill_value,
filters=x.zarray.filters,
new_zarray = x.zarray.replace(
chunks=new_chunk_shape,
shape=new_shape,
order=x.zarray.order,
zarr_format=x.zarray.zarr_format,
)

return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray)
return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray)


def _prepend_singleton_dimensions(shape: tuple[int, ...], ndim: int) -> tuple[int, ...]:
"""Prepend as many new length-1 axes to shape as necessary such that the result has ndim number of axes."""
n_prepended_dims = ndim - len(shape)
return tuple([1] * n_prepended_dims + list(shape))


# TODO broadcast_arrays, squeeze, permute_dims
Expand Down
Loading

0 comments on commit c4d4325

Please sign in to comment.