-
Notifications
You must be signed in to change notification settings - Fork 26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Chunk-aligned indexing #183
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import warnings | ||
from types import EllipsisType, NoneType | ||
from typing import Any, Callable, Union | ||
|
||
import numpy as np | ||
|
@@ -193,36 +194,85 @@ def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": | |
|
||
def __getitem__( | ||
self, | ||
key, | ||
key: Union[ | ||
int, | ||
slice, | ||
EllipsisType, | ||
None, | ||
tuple[Union[int, slice, EllipsisType, None], ...], | ||
np.ndarray, | ||
], | ||
/, | ||
) -> "ManifestArray": | ||
""" | ||
Only supports extremely limited indexing. | ||
Only supports indexing where slices are aligned with chunk boundaries. | ||
|
||
Only here because xarray will apparently attempt to index into its lazy indexing classes even if the operation would be a no-op anyway. | ||
Follows the array API standard otherwise. | ||
""" | ||
from xarray.core.indexing import BasicIndexer | ||
indexer = key | ||
|
||
if isinstance(key, BasicIndexer): | ||
indexer = key.tuple | ||
indexer_nd: tuple[Union[int, slice, EllipsisType, None, np.ndarray], ...] | ||
if isinstance(indexer, (int, slice, EllipsisType, NoneType, np.ndarray)): | ||
indexer_nd = (indexer,) | ||
else: | ||
indexer = key | ||
indexer_nd = indexer | ||
|
||
indexer = _possibly_expand_trailing_ellipsis(key, self.ndim) | ||
# _validate_indexer(indexer) | ||
|
||
if len(indexer) != self.ndim: | ||
indexer_nd = _possibly_expand_trailing_ellipses(indexer_nd, self.ndim) | ||
|
||
if len(indexer_nd) != self.ndim: | ||
raise ValueError( | ||
f"Invalid indexer for array with ndim={self.ndim}: {indexer}" | ||
f"Invalid indexer for array with ndim={self.ndim}: {indexer_nd}" | ||
) | ||
|
||
if all( | ||
isinstance(axis_indexer, slice) and axis_indexer == slice(None) | ||
for axis_indexer in indexer | ||
chunk_slices = [] | ||
new_arr_shape = [] | ||
for axis_num, (indexer_1d, arr_length, chunk_length) in enumerate( | ||
zip(indexer_nd, self.shape, self.chunks) | ||
): | ||
# indexer is all slice(None)'s, so this is a no-op | ||
return self | ||
else: | ||
raise NotImplementedError(f"Doesn't support slicing with {indexer}") | ||
if isinstance(indexer_1d, int): | ||
array_slice_1d = slice(indexer_1d, indexer_1d + 1, 1) | ||
elif isinstance(indexer_1d, NoneType): | ||
array_slice_1d = slice(0, arr_length, 1) | ||
elif isinstance(indexer_1d, slice): | ||
array_slice_1d = slice( | ||
indexer_1d.start if indexer_1d.start is not None else 0, | ||
indexer_1d.stop if indexer_1d.stop is not None else arr_length, | ||
indexer_1d.step if indexer_1d.step is not None else 1, | ||
) | ||
else: | ||
# TODO we could attempt to also support indexing with numpy arrays | ||
raise TypeError( | ||
f"Can only perform indexing with keys of type (int, slice, EllipsisType, NoneType), but got type {type(indexer_1d)} for axis {axis_num}" | ||
) | ||
|
||
chunk_slice_1d = _array_slice_to_chunk_slice( | ||
array_slice_1d, arr_length, chunk_length | ||
) | ||
chunk_slices.append(chunk_slice_1d) | ||
|
||
n_elements_in_slice = abs( | ||
(array_slice_1d.start - array_slice_1d.stop) / array_slice_1d.step | ||
) | ||
new_arr_shape.append(n_elements_in_slice) | ||
|
||
print(chunk_slices) | ||
|
||
# do slicing of entries in manifest | ||
sliced_paths = self.manifest._paths[tuple(chunk_slices)] | ||
sliced_offsets = self.manifest._offsets[tuple(chunk_slices)] | ||
sliced_lengths = self.manifest._lengths[tuple(chunk_slices)] | ||
sliced_manifest = ChunkManifest.from_arrays( | ||
paths=sliced_paths, | ||
offsets=sliced_offsets, | ||
lengths=sliced_lengths, | ||
) | ||
|
||
# chunk sizes are unchanged by slicing that aligns with chunks | ||
new_zarray = self.zarray.replace(shape=tuple(new_arr_shape)) | ||
|
||
return ManifestArray(chunkmanifest=sliced_manifest, zarray=new_zarray) | ||
|
||
def rename_paths( | ||
self, | ||
|
@@ -265,10 +315,47 @@ def rename_paths( | |
return ManifestArray(zarray=self.zarray, chunkmanifest=renamed_manifest) | ||
|
||
|
||
def _possibly_expand_trailing_ellipsis(key, ndim: int): | ||
if key[-1] == ...: | ||
extra_slices_needed = ndim - (len(key) - 1) | ||
*indexer, ellipsis = key | ||
return tuple(tuple(indexer) + (slice(None),) * extra_slices_needed) | ||
def _array_slice_to_chunk_slice( | ||
array_slice: slice, | ||
arr_length: int, | ||
chunk_length: int, | ||
) -> slice: | ||
""" | ||
Translate a slice into an array into a corresponding slice into the underlying chunk grid. | ||
Comment on lines
+323
to
+329
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
Will raise on any array slices that require slicing within individual chunks. | ||
""" | ||
|
||
if chunk_length == 1: | ||
# alot of indexing is possible only in this case, because this is basically just a normal array along that axis | ||
chunk_slice = array_slice | ||
return chunk_slice | ||
Comment on lines
+334
to
+337
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Kind of interesting to realise this. You can even do things like |
||
|
||
# Check that start of slice aligns with start of a chunk | ||
if array_slice.start % chunk_length != 0: | ||
raise NotImplementedError( | ||
f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" | ||
) | ||
|
||
# Check that slice spans integer number of chunks | ||
slice_length = array_slice.stop - array_slice.start | ||
if slice_length % chunk_length != 0: | ||
raise NotImplementedError( | ||
f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" | ||
) | ||
|
||
index_of_first_chunk = int(array_slice.start / chunk_length) | ||
n_chunks = int(slice_length / chunk_length) | ||
|
||
chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) | ||
|
||
return chunk_slice | ||
|
||
|
||
def _possibly_expand_trailing_ellipses(indexer, ndim: int): | ||
if indexer[-1] == ...: | ||
extra_slices_needed = ndim - (len(indexer) - 1) | ||
*indexer_1d, ellipsis = indexer | ||
return tuple(tuple(indexer_1d) + (slice(None),) * extra_slices_needed) | ||
else: | ||
return key | ||
return indexer |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -337,3 +337,26 @@ def test_refuse_combine(): | |
for func in [np.concatenate, np.stack]: | ||
with pytest.raises(ValueError, match="inconsistent dtypes"): | ||
func([marr1, marr2], axis=0) | ||
|
||
|
||
class TestIndexing: | ||
def test_slice_aligned_with_chunks(self): | ||
marr = create_manifestarray(shape=(4,), chunks=(2,)) | ||
marr[0:2] | ||
marr[2:4] | ||
marr[0:4] | ||
Comment on lines
+344
to
+347
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to come back and improve these placeholder tests |
||
|
||
with pytest.raises( | ||
NotImplementedError, match="slice would split individual chunks" | ||
): | ||
marr[0] | ||
|
||
with pytest.raises( | ||
NotImplementedError, match="slice would split individual chunks" | ||
): | ||
marr[0:1] | ||
|
||
with pytest.raises( | ||
NotImplementedError, match="slice would split individual chunks" | ||
): | ||
marr[0:3] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(probably don't need this as separate function)