diff --git a/CHANGES.rst b/CHANGES.rst index 363d36e67..8bdb893f5 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -20,6 +20,7 @@ Bug fixes * Coincidentally, this also fixes an error that caused `pytest` to error-out when invoked without an active internet connection. Running `pytest` without network access is now supported (requires cached testing data). (:issue:`1468`). * Calling a ``sdba.map_blocks``-wrapped function with data chunked along the reduced dimensions will raise an error. This forbids chunking the trained dataset along the distribution dimensions, for example. (:issue:`1481`, :pull:`1482`). * Optimization of indicators ``huglin_index`` and ``biologically_effective_degree_days`` when used with dask and flox. As a side effect, the indice functions (i.e. under ``xc.indices``) no longer mask incomplete periods. The indicators' output is unchanged under the default "check_missing" setting (:issue:`1494`, :pull:`1495`). +* Fixed ``xclim.indices.run_length.lazy_indexing`` which would sometimes trigger the loading of auxiliary coordinates. (:issue:`1483`, :pull:`1484`). Breaking changes ^^^^^^^^^^^^^^^^ @@ -39,6 +40,7 @@ Internal changes * Mastodon publishing now uses `chuhlomin/render-template `_ and a standard formatting markdown document to format Mastodon toots. (:pull:`1469`). * GitHub testing workflows now use `Concurrency` instead of the styfle/cancel-workflow-action to cancel redundant workflows. (:pull:`1487`). * The `pkg_resources` library has been replaced for the `packaging` library when version comparisons have been performed, and a few warning messages have been silenced in the testing suite. (:issue:`1489`, :pull:`1490`). +* New ``xclim.testing.helpers.assert_lazy`` context manager to assert the laziness of code blocks. (:pull:`1484`). v0.45.0 (2023-09-05) -------------------- diff --git a/tests/test_run_length.py b/tests/test_run_length.py index 74b726571..0e62653f7 100644 --- a/tests/test_run_length.py +++ b/tests/test_run_length.py @@ -10,6 +10,7 @@ from xclim.core.options import set_options from xclim.indices import run_length as rl +from xclim.testing.helpers import assert_lazy K2C = 273.15 @@ -654,6 +655,7 @@ def test_too_many_dates(self, func, tas_series): @pytest.mark.parametrize("use_dask", [True, False]) def test_lazy_indexing(use_dask): idx = xr.DataArray([[0, 10], [33, 99]], dims=("x", "y")) + idx = idx.assign_coords(x2=idx.x**2) da = xr.DataArray(np.arange(100), dims=("time",)) db = xr.DataArray(-np.arange(100), dims=("time",)) @@ -661,7 +663,10 @@ def test_lazy_indexing(use_dask): idx = idx.chunk({"x": 1}) # Ensure tasks are different - outa, outb = compute(rl.lazy_indexing(da, idx), rl.lazy_indexing(db, idx)) + with assert_lazy: + outa = rl.lazy_indexing(da, idx) + outb = rl.lazy_indexing(db, idx) + outa, outb = compute(outa, outb) assert set(outa.dims) == {"x", "y"} np.testing.assert_array_equal(idx, outa) diff --git a/xclim/indices/run_length.py b/xclim/indices/run_length.py index 7ce29a8e9..c05c66010 100644 --- a/xclim/indices/run_length.py +++ b/xclim/indices/run_length.py @@ -1345,15 +1345,22 @@ def _index_from_1d_array(indices, array): # The 0-D index case, we add a dummy dimension to help dask dim = get_temp_dimname(da.dims, "x") index = index.expand_dims(dim) - invalid = index.isnull() # Which indexes to mask + # Which indexes to mask. + invalid = index.isnull() # NaN-indexing doesn't work, so fill with 0 and cast to int index = index.fillna(0).astype(int) # for each chunk of index, take corresponding values from da da2 = da.rename("__placeholder__") out = index.map_blocks(_index_from_1d_array, args=(da2,)).rename(da.name) - # mask where index was NaN - out = out.where(~invalid) + # mask where index was NaN. Drop any auxiliary coord, they are already on `out`. + # Chunked aux coord would have the same name on both sides and xarray will want to check if they are equal, which means loading them + # making lazy_indexing not lazy. + out = out.where( + ~invalid.drop_vars( + [crd for crd in invalid.coords if crd not in invalid.dims] + ) + ) if idx_ndim == 0: # 0-D case, drop useless coords and dummy dim out = out.drop_vars(da.dims[0]).squeeze() diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index e87894edf..abc2c0681 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import xarray as xr +from dask.diagnostics import Callback from yaml import safe_load from xclim.core import calendar @@ -66,6 +67,7 @@ "PREFETCH_TESTING_DATA", "TESTDATA_BRANCH", "add_example_file_paths", + "assert_lazy", "generate_atmos", "populate_testing_data", "test_timeseries", @@ -253,3 +255,14 @@ def test_timeseries( return da.to_dataset() else: return da + + +def _raise_on_compute(dsk: dict): + """Raise an AssertionError mentionning the number triggered tasks.""" + raise AssertionError( + f"Not lazy. Computation was triggered with a graph of {len(dsk)} tasks." + ) + + +assert_lazy = Callback(start=_raise_on_compute) +"""Context manager that raises an AssertionError if any dask computation is triggered."""