From 6fef91c3134a4241faff9675c63963a9133db1cf Mon Sep 17 00:00:00 2001
From: Tom Nicholas
Date: Thu, 4 Apr 2024 20:00:17 -0400
Subject: [PATCH] Load selected variables instead of making them virtual (#69)

* load selected variables instead of making them virtual

* test

* fix test by explicitly handling encounters of IndexVariable objects

* nits

* load_variables->loadable_variables

* documentation

---
 docs/usage.md                     |  34 ++++++++
 virtualizarr/tests/test_xarray.py |  18 ++++
 virtualizarr/xarray.py            | 135 ++++++++++++++++++++++++------
 3 files changed, 160 insertions(+), 27 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 1d542ba8..53228b43 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -277,6 +277,36 @@ TODO: Reinstate this part of the docs once [GH issue #18](https://github.com/Tom
 
 TODO: Use preprocess to create a new index from the metadata
 
+## Loading variables
+
+Whilst the values of virtual variables (i.e. those backed by `ManifestArray` objects) cannot be loaded into memory, you can instead open specific variables from the file as lazy numpy/dask arrays, just as `xr.open_dataset` normally returns them. These variables are specified using the `loadable_variables` argument:
+
+```python
+vds = open_virtual_dataset('air.nc', loadable_variables=['air', 'time'])
+```
+```python
+<xarray.Dataset> Size: 31MB
+Dimensions:  (time: 2920, lat: 25, lon: 53)
+Coordinates:
+    lat      (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chunks=(25,)>
+    lon      (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chunks=(53,)>
+  * time     (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
+Data variables:
+    air      (time, lat, lon) float64 31MB ...
+```

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ ... @@ def open_virtual_dataset(
     filepath: str,
     filetype: Optional[str] = None,
     drop_variables: Optional[List[str]] = None,
+    loadable_variables: Optional[List[str]] = None,
     indexes: Optional[Mapping[str, Index]] = None,
     virtual_array_class=ManifestArray,
 ) -> xr.Dataset:
@@ -41,6 +43,9 @@ def open_virtual_dataset(
         If not provided will attempt to automatically infer the correct filetype from the filepath's extension.
     drop_variables: list[str], default is None
         Variables in the file to drop before returning.
+    loadable_variables: list[str], default is None
+        Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class.
+        Default is to open all variables as virtual arrays (i.e. ManifestArray).
     indexes : Mapping[str, Index], default is None
         Indexes to use on the returned xarray Dataset. Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes.
@@ -48,27 +53,75 @@ def open_virtual_dataset(
     virtual_array_class
         Virtual array class to use to represent the references to the chunks in each on-disk array.
         Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
+
+    Returns
+    -------
+    vds
+        An xarray Dataset containing instances of virtual_array_class for each variable, or normal lazily indexed arrays for each variable in loadable_variables.
     """
+    if drop_variables is None:
+        drop_variables = []
+    elif isinstance(drop_variables, str):
+        drop_variables = [drop_variables]
+    else:
+        drop_variables = list(drop_variables)
+    if loadable_variables is None:
+        loadable_variables = []
+    elif isinstance(loadable_variables, str):
+        loadable_variables = [loadable_variables]
+    else:
+        loadable_variables = list(loadable_variables)
+    common = set(drop_variables).intersection(set(loadable_variables))
+    if common:
+        raise ValueError(f"Cannot both load and drop variables {common}")
+
+    # this is the only place we actually always need to use kerchunk directly
+    # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
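+    # (the loadable_variables, by contrast, are read through xr.open_dataset further
+    # below, so they pass through xarray's normal lazy-loading and decoding machinery)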
     vds_refs = kerchunk.read_kerchunk_references_from_file(
         filepath=filepath,
         filetype=filetype,
     )
+    virtual_vars = virtual_vars_from_kerchunk_refs(
+        vds_refs,
+        drop_variables=drop_variables + loadable_variables,
+        virtual_array_class=virtual_array_class,
+    )
+    ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
 
-    if indexes is None:
-        # add default indexes by reading data from file
+    if indexes is None or len(loadable_variables) > 0:
         # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
         # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
-        ds = xr.open_dataset(filepath)
-        indexes = ds.xindexes
-        ds.close()
+        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
+        ds = xr.open_dataset(filepath, drop_variables=drop_variables)
+
+        if indexes is None:
+            # add default indexes by reading data from file
+            indexes = {name: index for name, index in ds.xindexes.items()}
+        elif indexes != {}:
+            # TODO allow manual specification of index objects
+            raise NotImplementedError()
+        else:
+            indexes = dict(**indexes)  # for type hinting: to allow mutation
 
-    vds = dataset_from_kerchunk_refs(
-        vds_refs,
-        drop_variables=drop_variables,
-        virtual_array_class=virtual_array_class,
-        indexes=indexes,
+        loadable_vars = {name: var for name, var in ds.variables.items() if name in loadable_variables}
+
+        # if we only read the indexes we can just close the file right away as nothing is lazy
+        if loadable_vars == {}:
+            ds.close()
+    else:
+        loadable_vars = {}
+        indexes = {}
+
+    vars = {**virtual_vars, **loadable_vars}
+
+    data_vars, coords = separate_coords(vars, indexes)
+
+    vds = xr.Dataset(
+        data_vars,
+        coords=coords,
+        # indexes={},  # TODO should be added in a later version of xarray
+        attrs=ds_attrs,
     )
 
     # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
@@ -76,14 +129,13 @@ def open_virtual_dataset(
     return vds
 
 
-def dataset_from_kerchunk_refs(
+def virtual_vars_from_kerchunk_refs(
     refs: KerchunkStoreRefs,
     drop_variables: Optional[List[str]] = None,
     virtual_array_class=ManifestArray,
-    indexes={},
-) -> xr.Dataset:
+) -> Mapping[str, xr.Variable]:
     """
-    Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays.
+    Translate a store-level kerchunk reference dict into a set of xarray Variables containing virtualized arrays.
 
     drop_variables: list[str], default is None
         Variables in the file to drop before returning.
@@ -99,24 +151,44 @@
         var_name for var_name in var_names if var_name not in drop_variables
     ]
 
-    vars = {}
-    for var_name in var_names_to_keep:
-        vars[var_name] = variable_from_kerchunk_refs(
-            refs, var_name, virtual_array_class
-        )
+    vars = {
+        var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class)
+        for var_name in var_names_to_keep
+    }
+
+    return vars
+
+
+def dataset_from_kerchunk_refs(
+    refs: KerchunkStoreRefs,
+    drop_variables: Optional[List[str]] = None,
+    virtual_array_class=ManifestArray,
+    indexes={},
+) -> xr.Dataset:
+    """
+    Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays.
+
+    drop_variables: list[str], default is None
+        Variables in the file to drop before returning.
+    virtual_array_class
+        Virtual array class to use to represent the references to the chunks in each on-disk array.
+        Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
+    """
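+
+    # Build the virtual variables through the same helper used by
+    # open_virtual_dataset, so both entry points share one code path.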
+    vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class)
 
     data_vars, coords = separate_coords(vars, indexes)
 
     ds_attrs = kerchunk.fully_decode_arr_refs(refs["refs"]).get(".zattrs", {})
 
-    ds = xr.Dataset(
+    vds = xr.Dataset(
         data_vars,
         coords=coords,
         # indexes={},  # TODO should be added in a later version of xarray
         attrs=ds_attrs,
     )
 
-    return ds
+    return vds
 
 
 def variable_from_kerchunk_refs(
@@ -134,13 +206,15 @@ def variable_from_kerchunk_refs(
 
 
 def separate_coords(
-    vars: dict[str, xr.Variable],
-    indexes={},
-) -> tuple[dict[str, xr.Variable], xr.Coordinates]:
+    vars: Mapping[str, xr.Variable],
+    indexes: MutableMapping[str, Index],
+) -> tuple[Mapping[str, xr.Variable], xr.Coordinates]:
     """
     Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates.
 
-    I thought this should be easy but it was actually really hard - in the end I had to checkout xarray v2023.08.0, the last one before #8107 was merged.
+    Currently requires a workaround unless xarray PR #8107 is merged.
+
+    Will also preserve any loaded variables and indexes it is passed.
     """
 
     # this would normally come from CF decoding, let's hope the fact we're skipping that doesn't cause any problems...
@@ -155,6 +229,13 @@ def separate_coords(
             if len(var.dims) == 1:
                 dim1d, *_ = var.dims
                 coord_vars[name] = (dim1d, var.data)
+
+                if isinstance(var, IndexVariable):
+                    # unless the variable actually already is a loaded IndexVariable,
+                    # in which case we need to keep it and add the corresponding indexes explicitly
+                    coord_vars[name] = var
+                    # TODO this seems suspect - will it handle datetimes?
+                    indexes[name] = PandasIndex(var, dim1d)
             else:
                 coord_vars[name] = var
         else:
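
For reviewers, a minimal sketch of the behaviour this patch adds (the file name `air.nc` and the variable names are illustrative, and `open_virtual_dataset` is assumed to be importable from the package root):

```python
from virtualizarr import open_virtual_dataset

# 'air' and 'time' are opened through xarray as ordinary lazy arrays, while
# every other variable stays virtual, backed by ManifestArray chunk references.
vds = open_virtual_dataset("air.nc", loadable_variables=["air", "time"])

# drop_variables and loadable_variables may be combined, but must be disjoint:
# asking to both drop and load the same variable raises a ValueError.
try:
    open_virtual_dataset("air.nc", drop_variables=["air"], loadable_variables=["air"])
except ValueError as err:
    print(err)  # Cannot both load and drop variables {'air'}
```

The type hints and the `IndexVariable`/`PandasIndex` handling rely on the corresponding imports (`Mapping` and `MutableMapping` from `typing`; `Index`, `PandasIndex` and `IndexVariable` from xarray) being present at the top of `virtualizarr/xarray.py`.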