diff --git a/changelog/278.fix.md b/changelog/278.fix.md new file mode 100644 index 0000000..585a5a8 --- /dev/null +++ b/changelog/278.fix.md @@ -0,0 +1 @@ +Fix problems with downscaling where data is zero for all basket contents. diff --git a/primap2/_downscale.py b/primap2/_downscale.py index 721ed77..530b1a5 100644 --- a/primap2/_downscale.py +++ b/primap2/_downscale.py @@ -1,5 +1,7 @@ from collections.abc import Hashable, Sequence +import numpy as np +import pandas as pd import xarray as xr from ._accessor_base import BaseDataArrayAccessor, BaseDatasetAccessor @@ -37,6 +39,11 @@ def downscale_timeseries( used to downscale the basket to its contents, which is used to fill gaps in the timeseries of the basket contents. + If the data to downscale contains only 0 and NaN the result will be all + zero timeseries. If the data is only for individual basket and basket_content + values, the resulting downscaled data points are zero but the shares for other years + are not influenced + Parameters ---------- dim: str @@ -104,6 +111,35 @@ def downscale_timeseries( "To continue regardless, set check_consistency=False." ) + # treat zeros + basket_both_zero = basket_da.where((basket_da == 0) & (basket_sum == 0)) + basket_sum_zero = basket_da.where((basket_da != 0) & (basket_sum == 0)) + if not basket_sum_zero.isnull().all(): + # this can only happen if check consistency = False, + # so we could also remove it and say whoever switches it of has + # to deal with the consequences + error_message = generate_error_message(basket_sum_zero) + raise ValueError(f"pr.downscale_timeseries {error_message}") + + any_nonzero = basket_sum.where(basket_sum != 0).notnull().any() + + # treat the case where all data is zero or NaN + if (not basket_both_zero.isnull().all()) & (not any_nonzero): + # create fake basket_contents_da and basket_sum which contain + # 1 for all data points where NaNs are either in the sum or the basket + # this will later lead to equal shares for the NaNs. + units = basket_contents_da.pint.units + basket_da_squeeze = basket_da.drop(dim) + basket_contents_da = basket_contents_da.where( + (basket_da_squeeze != 0) | (basket_sum != 0), 1 * units + ) + basket_sum = basket_contents_da.pr.sum( + dim=dim, + skipna=skipna, + min_count=1, + skipna_evaluation_dims=skipna_evaluation_dims, + ) + # inter- and extrapolate shares: xr.DataArray = ( (basket_contents_da / basket_sum) @@ -114,6 +150,11 @@ def downscale_timeseries( .bfill(dim="time") ) + # treat the case where there are zero values in basket_sum and basket_da but also + # non-zero data + if (not basket_both_zero.isnull().all()) & any_nonzero: + shares = shares.where((basket_da != 0) | (basket_sum != 0), 0) + downscaled: xr.DataArray = basket_da * shares return self._da.fillna(downscaled) @@ -146,6 +187,11 @@ def downscale_timeseries( used to downscale the basket to its contents, which is used to fill gaps in the timeseries of the basket contents. + If the data to downscale contains only 0 an NaN the result will be all + zero timeseries. If the data is only for individual basket and basket_content + values, the resulting downscaled data points are zero but the shares for other years + are not influenced + Parameters ---------- dim: str @@ -193,51 +239,20 @@ def downscale_timeseries( "Use ds.pr.remove_processing_info()." ) - ds_sel = select_no_scalar_dimension(self._ds, sel) - - basket_contents_ds = ds_sel.loc[{dim: basket_contents}] - basket_ds = ds_sel.loc[{dim: basket}] - - if skipna_evaluation_dims is not None: - if skipna: - raise ValueError( - "Only one of 'skipna' and 'skipna_evaluation_dims' may be supplied, not" - " both." - ) - else: - skipna = None - - basket_sum = basket_contents_ds.pr.sum( - dim=dim, - skipna=skipna, - min_count=1, - skipna_evaluation_dims=skipna_evaluation_dims, - ) - - if check_consistency: - deviation = abs(basket_ds / basket_sum - 1) - devmax = deviation.to_array().max() - if devmax > tolerance: - raise ValueError( - f"Sum of the basket_contents {basket_contents!r} deviates" - f" {devmax * 100} % from the basket" - f" {basket!r}, which is more than the allowed 1 %. " - "To continue regardless, set check_consistency=False." - ) - - # inter- and extrapolate - shares: xr.Dataset = ( - (basket_contents_ds / basket_sum) - .pint.to({x: "" for x in basket_contents_ds.keys()}) - .pint.dequantify() - .interpolate_na(dim="time", method="linear") - .ffill(dim="time") - .bfill(dim="time") - ) - - downscaled: xr.Dataset = basket_ds * shares + downscaled = self._ds.copy() + for var in self._ds.data_vars: + downscaled[var] = downscaled[var].pr.downscale_timeseries( + dim=dim, + basket=basket, + basket_contents=basket_contents, + check_consistency=check_consistency, + sel=sel, + skipna_evaluation_dims=skipna_evaluation_dims, + skipna=skipna, + tolerance=tolerance, + ) - return self._ds.fillna(downscaled) + return downscaled def downscale_gas_timeseries( self, @@ -265,6 +280,11 @@ def downscale_gas_timeseries( converted back afterwards; for both conversions the gwp conversions of the basket are used. + If the data to downscale contains only 0 an NaN the result will be all + zero timeseries. If the data is only for individual basket and basket_content + values, the resulting downscaled data points are zero but the shares for other years + are not influenced + Parameters ---------- basket: str @@ -306,10 +326,10 @@ def downscale_gas_timeseries( ds_sel = select_no_scalar_dimension(self._ds, sel) basket_contents_converted = xr.Dataset() - da_basket = ds_sel[basket] + basket_da = ds_sel[basket] for var in basket_contents: da: xr.DataArray = ds_sel[var] - basket_contents_converted[var] = da.pr.convert_to_gwp_like(like=da_basket) + basket_contents_converted[var] = da.pr.convert_to_gwp_like(like=basket_da) if skipna_evaluation_dims is not None: if skipna: @@ -328,7 +348,7 @@ def downscale_gas_timeseries( ) if check_consistency: - deviation = abs(da_basket / basket_sum - 1) + deviation = abs(basket_da / basket_sum - 1) devmax = deviation.max().item() if devmax > tolerance: raise ValueError( @@ -340,6 +360,31 @@ def downscale_gas_timeseries( "To continue regardless, set check_consistency=False." ) + # treat zeros + basket_both_zero = basket_da.where((basket_da == 0) & (basket_sum == 0)) + basket_sum_zero = basket_da.where((basket_da != 0) & (basket_sum == 0)) + if not basket_sum_zero.isnull().all(): + # this can only happen if check consistency = False, + # so we could also remove it and say whoever switches it of has + # to deal with the consequences + error_message = generate_error_message(basket_sum_zero) + raise ValueError(f"pr.downscale_gas_timeseries {error_message}") + + any_nonzero = basket_sum.where(basket_sum != 0).notnull().any() + + # treat the case where all data is zero or NaN + if (not basket_both_zero.isnull().all()) & (not any_nonzero): + unit_content = str(basket_da.pint.units) + basket_contents_converted = basket_contents_converted.where( + (basket_da != 0) | (basket_sum != 0), 1 * ureg(unit_content) + ) + basket_sum = basket_contents_converted.pr.sum( + dim="entity", + skipna=skipna, + min_count=1, + skipna_evaluation_dims=skipna_evaluation_dims, + ) + # inter- and extrapolate shares = ( (basket_contents_converted / basket_sum) @@ -350,14 +395,55 @@ def downscale_gas_timeseries( .bfill(dim="time") ) - downscaled = da_basket * shares + # treat the case where there are zero values in basket_sum and basket_da but also + # non-zero data + if (not basket_both_zero.isnull().all()) & any_nonzero: + shares = shares.where((basket_da != 0) | (basket_sum != 0), 0) + + downscaled = basket_da * shares downscaled_converted = xr.Dataset() - with ureg.context(da_basket.attrs["gwp_context"]): + with ureg.context(basket_da.attrs["gwp_context"]): for var in basket_contents: downscaled_converted[var] = ds_sel[var].fillna( downscaled[var].pint.to(ds_sel[var].pint.units) ) return self._ds.pr.fillna(downscaled_converted) + + +def generate_error_message(da_error: xr.DataArray) -> str: + """Generate error message for zero sum data. + + based on generate_log_message for pr.merge + Strategy: + + * remove all length-1 dimensions and put them in description + * convert the rest into a pandas dataframe for nice printing + """ + scalar_dims = [dim for dim in da_error.dims if len(da_error[dim]) == 1] + scalar_dims_format = [] + for dim in scalar_dims: + if pd.api.types.is_datetime64_any_dtype(da_error[dim]): + ts = pd.Timestamp(da_error[dim][0].values) + # optimization for the common case where we have data on a yearly or + # more coarse basis + if ts == pd.Timestamp(year=ts.year, month=1, day=1): + scalar_dims_format.append(f"{dim}={ts.strftime('%Y')}") + else: + scalar_dims_format.append(f"{dim}={ts!s}") + else: + scalar_dims_format.append(f"{dim}={da_error[dim].item()}") + scalar_dims_str = ", ".join(scalar_dims_format) + da_error_dequ = da_error.squeeze(drop=True).pint.dequantify() + if np.ndim(da_error_dequ.data) == 0: + errors_str = "" + else: + errors_str = da_error_dequ.to_dataframe().dropna().to_string() + + return ( + "error: found zero basket content sum for non-zero basket data" + f" for {scalar_dims_str}:\n" + f"({da_error.name})\n" + errors_str + ) diff --git a/primap2/tests/test_downscale.py b/primap2/tests/test_downscale.py index ca2f59b..9c39a06 100644 --- a/primap2/tests/test_downscale.py +++ b/primap2/tests/test_downscale.py @@ -10,7 +10,8 @@ from .utils import allclose, assert_equal -def test_downscale_gas_timeseries(empty_ds): +@pytest.fixture +def gas_downscaling_ds(empty_ds): for key in empty_ds: empty_ds[key].pint.magnitude[:] = np.nan empty_ds["CO2"].loc[{"time": "2002"}] = 1 * ureg("Gg CO2 / year") @@ -22,40 +23,11 @@ def test_downscale_gas_timeseries(empty_ds): empty_ds["KYOTOGHG (AR4GWP100)"].loc[{"time": "2020"}] = ( 2 * (1 + sf6 + ch4) * ureg("Gg CO2 / year") ) - - downscaled = empty_ds.pr.downscale_gas_timeseries( - basket="KYOTOGHG (AR4GWP100)", basket_contents=["CO2", "SF6", "CH4"] - ) - expected = empty_ds.copy() - expected["CO2"][:] = 1 * ureg("Gg CO2 / year") - expected["SF6"][:] = 1 * ureg("Gg SF6 / year") - expected["CH4"][:] = 1 * ureg("Gg CH4 / year") - expected["CO2"].loc[{"time": "2020"}] = 2 * ureg("Gg CO2 / year") - expected["SF6"].loc[{"time": "2020"}] = 2 * ureg("Gg SF6 / year") - expected["CH4"].loc[{"time": "2020"}] = 2 * ureg("Gg CH4 / year") - - xr.testing.assert_identical(downscaled, expected) - - with pytest.raises( - ValueError, - match="Only one of 'skipna' and 'skipna_evaluation_dims' may be supplied, not both.", - ): - empty_ds.pr.downscale_gas_timeseries( - basket="KYOTOGHG (AR4GWP100)", - basket_contents=["CO2", "SF6", "CH4"], - skipna_evaluation_dims=["time"], - skipna=True, - ) - - empty_ds["SF6"].loc[{"time": "2002"}] = 2 * ureg("Gg SF6 / year") - - with pytest.raises(ValueError, match="To continue regardless, set check_consistency=False"): - empty_ds.pr.downscale_gas_timeseries( - basket="KYOTOGHG (AR4GWP100)", basket_contents=["CO2", "SF6", "CH4"] - ) + return empty_ds -def test_downscale_timeseries(empty_ds): +@pytest.fixture +def dim_downscaling_ds(empty_ds): for key in empty_ds: empty_ds[key].pint.magnitude[:] = np.nan t = empty_ds.loc[{"area (ISO3)": "BOL"}].copy() @@ -74,11 +46,17 @@ def test_downscale_timeseries(empty_ds): da.loc[{"area (ISO3)": "CAMB", "source": "RAND2020"}] = np.concatenate( [np.array([6] * 11), np.stack([8, 8]), np.linspace(8, 10, 8)] ) * ureg("Gg CO2 / year") + return ds + + +@pytest.fixture +def dim_downscaling_da(dim_downscaling_ds): + return dim_downscaling_ds["CO2"] - downscaled = da.pr.downscale_timeseries( - dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"] - ) - expected = da.copy() + +@pytest.fixture +def dim_downscaling_expected_da(dim_downscaling_da): + expected = dim_downscaling_da.copy() expected.loc[{"area (ISO3)": ["COL", "ARG", "MEX"], "source": "RAND2020"}] = np.broadcast_to( np.concatenate( @@ -97,26 +75,66 @@ def test_downscale_timeseries(empty_ds): np.linspace(2, 2 * 10 / 8, 8), ] ) * ureg("Gg CO2 / year") + return expected + + +def test_downscale_gas_timeseries(gas_downscaling_ds): + downscaled = gas_downscaling_ds.pr.downscale_gas_timeseries( + basket="KYOTOGHG (AR4GWP100)", basket_contents=["CO2", "SF6", "CH4"] + ) + expected = gas_downscaling_ds.copy() + expected["CO2"][:] = 1 * ureg("Gg CO2 / year") + expected["SF6"][:] = 1 * ureg("Gg SF6 / year") + expected["CH4"][:] = 1 * ureg("Gg CH4 / year") + expected["CO2"].loc[{"time": "2020"}] = 2 * ureg("Gg CO2 / year") + expected["SF6"].loc[{"time": "2020"}] = 2 * ureg("Gg SF6 / year") + expected["CH4"].loc[{"time": "2020"}] = 2 * ureg("Gg CH4 / year") + + xr.testing.assert_identical(downscaled, expected) + + with pytest.raises( + ValueError, + match="Only one of 'skipna' and 'skipna_evaluation_dims' may be supplied, not both.", + ): + gas_downscaling_ds.pr.downscale_gas_timeseries( + basket="KYOTOGHG (AR4GWP100)", + basket_contents=["CO2", "SF6", "CH4"], + skipna_evaluation_dims=["time"], + skipna=True, + ) + + gas_downscaling_ds["SF6"].loc[{"time": "2002"}] = 2 * ureg("Gg SF6 / year") + + with pytest.raises(ValueError, match="To continue regardless, set check_consistency=False"): + gas_downscaling_ds.pr.downscale_gas_timeseries( + basket="KYOTOGHG (AR4GWP100)", basket_contents=["CO2", "SF6", "CH4"] + ) + + +def test_downscale_timeseries(dim_downscaling_ds, dim_downscaling_da, dim_downscaling_expected_da): + downscaled = dim_downscaling_da.pr.downscale_timeseries( + dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"] + ) # we need a higher atol, because downscale_timeseries actually does the # downscaling using a proper calendar while here we use a calendar where all years # have the same length. - assert_equal(downscaled, expected, equal_nan=True, atol=0.01) + assert_equal(downscaled, dim_downscaling_expected_da, equal_nan=True, atol=0.01) allclose( downscaled.loc[{"area (ISO3)": "CAMB"}], downscaled.loc[{"area (ISO3)": ["COL", "ARG", "MEX", "BOL"]}].sum(dim="area (ISO3)"), ) - downscaled_ds = ds.pr.downscale_timeseries( + downscaled_ds = dim_downscaling_ds.pr.downscale_timeseries( dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"] ) - assert_equal(downscaled_ds["CO2"], expected, equal_nan=True, atol=0.01) + assert_equal(downscaled_ds["CO2"], dim_downscaling_expected_da, equal_nan=True, atol=0.01) with pytest.raises( ValueError, match="Only one of 'skipna' and 'skipna_evaluation_dims' may be supplied, not both.", ): - ds.pr.downscale_timeseries( + dim_downscaling_ds.pr.downscale_timeseries( dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"], @@ -124,22 +142,22 @@ def test_downscale_timeseries(empty_ds): skipna=True, ) - da.loc[{"area (ISO3)": "BOL", "time": "2002"}] = 2 * ureg("Gg CO2 / year") + dim_downscaling_da.loc[{"area (ISO3)": "BOL", "time": "2002"}] = 2 * ureg("Gg CO2 / year") with pytest.raises(ValueError, match="To continue regardless, set check_consistency=False"): - da.pr.downscale_timeseries( + dim_downscaling_da.pr.downscale_timeseries( dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"], ) - downscaled = da.pr.downscale_timeseries( + downscaled = dim_downscaling_da.pr.downscale_timeseries( dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"], check_consistency=False, ) - expected = da.copy() + expected = dim_downscaling_da.copy() expected.loc[{"area (ISO3)": ["COL", "ARG", "MEX"], "source": "RAND2020"}] = np.broadcast_to( np.concatenate( @@ -161,14 +179,14 @@ def test_downscale_timeseries(empty_ds): assert_equal(downscaled, expected, equal_nan=True, atol=0.01) - downscaled = da.pr.downscale_timeseries( + downscaled = dim_downscaling_da.pr.downscale_timeseries( dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"], check_consistency=False, sel={"time": slice("2005", "2020")}, ) - expected = da.copy() + expected = dim_downscaling_da.copy() expected.loc[{"area (ISO3)": ["COL", "ARG", "MEX", "BOL"], "source": "RAND2020"}] = ( np.broadcast_to( @@ -201,3 +219,131 @@ def test_downscale_timeseries(empty_ds): expected.loc[{"area (ISO3)": "BOL", "time": "2002"}] = 2 * ureg("Gg CO2 / year") assert_equal(downscaled, expected, equal_nan=True, atol=0.01) + + +def test_downscale_timeseries_da_zero(dim_downscaling_da, dim_downscaling_expected_da): + dim_downscaling_da = dim_downscaling_da * 0 + + dim_downscaling_expected_da = dim_downscaling_expected_da * 0 + + downscaled = dim_downscaling_da.pr.downscale_timeseries( + dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"] + ) + + assert_equal(downscaled, dim_downscaling_expected_da, equal_nan=True) + allclose( + downscaled.loc[{"area (ISO3)": "CAMB"}], + downscaled.loc[{"area (ISO3)": ["COL", "ARG", "MEX", "BOL"]}].sum(dim="area (ISO3)"), + ) + + +def test_downscale_timeseries_da_partial_zero(dim_downscaling_da, dim_downscaling_expected_da): + dim_downscaling_da.loc[{"time": "2002"}] = dim_downscaling_da.loc[{"time": "2002"}] * 0 + + dim_downscaling_expected_da.loc[ + {"area (ISO3)": ["COL", "ARG", "MEX", "BOL"], "source": "RAND2020"} + ] = np.broadcast_to( + np.concatenate( + [ + np.array([1.5, 1.5]), + np.array([0]), + 1 / 4 * np.array([6] * 8 + [8] * 2), + np.linspace(2, 2 * 10 / 8, 8), + ] + ), + (4, 21), + ).T * ureg("Gg CO2 / year") + + dim_downscaling_expected_da.loc[ + {"area (ISO3)": ["CAMB"], "source": "RAND2020", "time": ["2002"]} + ] = 0 * ureg("Gg CO2 / year") + + downscaled = dim_downscaling_da.pr.downscale_timeseries( + dim="area (ISO3)", basket="CAMB", basket_contents=["COL", "ARG", "MEX", "BOL"] + ) + + assert_equal(downscaled, dim_downscaling_expected_da, equal_nan=True) + allclose( + downscaled.loc[{"area (ISO3)": "CAMB"}], + downscaled.loc[{"area (ISO3)": ["COL", "ARG", "MEX", "BOL"]}].sum(dim="area (ISO3)"), + ) + + +def test_downscale_timeseries_da_zero_sum_error(dim_downscaling_da): + dim_downscaling_da.loc[{"area (ISO3)": ["COL", "ARG", "MEX", "BOL"]}] = 0 * ureg( + str(dim_downscaling_da.pint.units) + ) + + with pytest.raises( + ValueError, + match=r"pr.downscale_timeseries error: found zero basket content sum for " + r"non-zero basket data for source=RAND2020:", + ): + dim_downscaling_da.pr.downscale_timeseries( + dim="area (ISO3)", + basket="CAMB", + basket_contents=["COL", "ARG", "MEX", "BOL"], + check_consistency=False, + ) + + +def test_downscale_gas_timeseries_zero(gas_downscaling_ds): + for var in gas_downscaling_ds.data_vars: + gas_downscaling_ds[var].data = gas_downscaling_ds[var].data * 0 + + downscaled = gas_downscaling_ds.pr.downscale_gas_timeseries( + basket="KYOTOGHG (AR4GWP100)", basket_contents=["CO2", "SF6", "CH4"] + ) + expected = gas_downscaling_ds.copy() + expected["CO2"][:] = 0 * ureg("Gg CO2 / year") + expected["SF6"][:] = 0 * ureg("Gg SF6 / year") + expected["CH4"][:] = 0 * ureg("Gg CH4 / year") + + xr.testing.assert_identical(downscaled, expected) + + +def test_downscale_gas_timeseries_zero_sum_error(gas_downscaling_ds): + basket_contents = ["CO2", "SF6", "CH4"] + + for var in basket_contents: + gas_downscaling_ds[var].loc[{"area (ISO3)": ["COL", "ARG", "MEX", "BOL"]}] = 0 * ureg( + str(gas_downscaling_ds[var].pint.units) + ) + + with pytest.raises( + ValueError, + match=r"pr.downscale_gas_timeseries error: found zero basket content " + r"sum for non-zero basket data for source=RAND2020:", + ): + gas_downscaling_ds.pr.downscale_gas_timeseries( + basket="KYOTOGHG (AR4GWP100)", + basket_contents=basket_contents, + check_consistency=False, + ) + + +def test_downscale_gas_timeseries_da_partial_zero(gas_downscaling_ds): + for var in ["CO2", "SF6", "CH4"]: + gas_downscaling_ds[var].loc[{"time": ["2010"]}] = 0 * gas_downscaling_ds[var].loc[ + {"time": ["2002"]} + ].squeeze(dim="time") + gas_downscaling_ds["KYOTOGHG (AR4GWP100)"].loc[{"time": ["2010"]}] = 0 * gas_downscaling_ds[ + "KYOTOGHG (AR4GWP100)" + ].loc[{"time": ["2002"]}].squeeze(dim="time") + + downscaled = gas_downscaling_ds.pr.downscale_gas_timeseries( + basket="KYOTOGHG (AR4GWP100)", basket_contents=["CO2", "SF6", "CH4"] + ) + + expected = gas_downscaling_ds.copy() + expected["CO2"][:] = 1 * ureg("Gg CO2 / year") + expected["SF6"][:] = 1 * ureg("Gg SF6 / year") + expected["CH4"][:] = 1 * ureg("Gg CH4 / year") + expected["CO2"].loc[{"time": "2020"}] = 2 * ureg("Gg CO2 / year") + expected["SF6"].loc[{"time": "2020"}] = 2 * ureg("Gg SF6 / year") + expected["CH4"].loc[{"time": "2020"}] = 2 * ureg("Gg CH4 / year") + expected["CO2"].loc[{"time": "2010"}] = 0 * ureg("Gg CO2 / year") + expected["SF6"].loc[{"time": "2010"}] = 0 * ureg("Gg SF6 / year") + expected["CH4"].loc[{"time": "2010"}] = 0 * ureg("Gg CH4 / year") + + xr.testing.assert_identical(downscaled, expected)