From 2efd34ab45b273aa083ad369e17657181b1cef9d Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sat, 25 Apr 2020 21:19:15 +0200 Subject: [PATCH 01/22] Sort output in 'ixmp report' CLI command --- ixmp/cli.py | 2 +- ixmp/tests/reporting/test_reporting.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ixmp/cli.py b/ixmp/cli.py index 293df0690..1262deb0b 100644 --- a/ixmp/cli.py +++ b/ixmp/cli.py @@ -92,7 +92,7 @@ def report(context, config, key): r.configure(config) # Print the target - print(r.get(key)) + print(r.get(key).to_series().sort_index()) @main.command('show-versions') diff --git a/ixmp/tests/reporting/test_reporting.py b/ixmp/tests/reporting/test_reporting.py index 5aa04d1f2..6e9b3550b 100644 --- a/ixmp/tests/reporting/test_reporting.py +++ b/ixmp/tests/reporting/test_reporting.py @@ -620,12 +620,12 @@ def test_cli(ixmp_cli, test_mp, test_data_path): assert result.output.endswith( "i j " # Trailing whitespace """ -seattle new-york 2.5 - chicago 1.7 - topeka 1.8 -san-diego new-york 2.5 - chicago 1.8 +san-diego chicago 1.8 + new-york 2.5 topeka 1.4 +seattle chicago 1.7 + new-york 2.5 + topeka 1.8 Name: value, dtype: float64 """) From 263ccbdacac521e409f97355244a0a876bfad8d0 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sat, 25 Apr 2020 21:28:08 +0200 Subject: [PATCH 02/22] Split reporting.quantity and reporting.attrseries - Make Quantity() a factory method rather than class/type. - Quantity.CLASS is a string used to check the current implementation. - Remove reporting.as_quantity and all uses of this function; Quantity(...) now does the same. - Move AttrSeries.align_levels from computations.product. - Update testing.assert_qty_equal and .assert_qty_allclose. 
--- ixmp/reporting/__init__.py | 3 +- ixmp/reporting/attrseries.py | 163 ++++++++++++++++++ ixmp/reporting/computations.py | 37 ++-- ixmp/reporting/quantity.py | 196 +++------------------- ixmp/testing.py | 41 ++--- ixmp/tests/reporting/__init__.py | 1 + ixmp/tests/reporting/test_computations.py | 12 +- ixmp/tests/reporting/test_reporting.py | 18 +- 8 files changed, 235 insertions(+), 236 deletions(-) create mode 100644 ixmp/reporting/attrseries.py diff --git a/ixmp/reporting/__init__.py b/ixmp/reporting/__init__.py index 8e0341765..e1cd124e4 100644 --- a/ixmp/reporting/__init__.py +++ b/ixmp/reporting/__init__.py @@ -44,7 +44,7 @@ from .describe import describe_recursive from .exceptions import ComputationError from .key import Key -from .quantity import Quantity, as_quantity +from .quantity import Quantity from .utils import ( REPLACE_UNITS, RENAME_DIMS, @@ -56,7 +56,6 @@ 'Key', 'Quantity', 'Reporter', - 'as_quantity', 'configure', ] diff --git a/ixmp/reporting/attrseries.py b/ixmp/reporting/attrseries.py new file mode 100644 index 000000000..4cbed5f0d --- /dev/null +++ b/ixmp/reporting/attrseries.py @@ -0,0 +1,163 @@ +from collections.abc import Collection + +import pandas as pd +import pandas.core.indexes.base as ibase +import pint +import xarray as xr + + +class AttrSeries(pd.Series): + """:class:`pandas.Series` subclass imitating :class:`xarray.DataArray`. + + Future versions of :mod:`ixmp.reporting` will use :class:`xarray.DataArray` + as :class:`Quantity`; however, because :mod:`xarray` currently lacks sparse + matrix support, ixmp quantities may be too large for available memory. + + The AttrSeries class provides similar methods and behaviour to + :class:`xarray.DataArray`, so that :mod:`ixmp.reporting.computations` + methods can use xarray-like syntax. + + Parameters + ---------- + units : str or pint.Unit, optional + Set the units attribute. The value is converted to :class:`pint.Unit` + and added to `attrs`. 
+ attrs : :class:`~collections.abc.Mapping`, optional + Set the :attr:`~pandas.Series.attrs` of the AttrSeries. This attribute + was added in `pandas 1.0 + `_, but is not + currently supported by the Series constructor. + """ + + # See https://pandas.pydata.org/docs/development/extending.html + @property + def _constructor(self): + return AttrSeries + + def __init__(self, data=None, *args, name=None, units=None, attrs=None, + **kwargs): + attrs = attrs or dict() + if units: + # Insert the units into the attrs + attrs['_unit'] = pint.Unit(units) + + if isinstance(data, (AttrSeries, xr.DataArray)): + # Use attrs from an existing object + new_attrs = data.attrs.copy() + + # Overwrite with explicit attrs argument + new_attrs.update(attrs) + attrs = new_attrs + + # Pre-convert to pd.Series from xr.DataArray to preserve names and + # labels. For AttrSeries, this is a no-op (see below). + name = ibase.maybe_extract_name(name, data, type(self)) + data = data.to_series() + + # Don't pass attrs to pd.Series constructor; it currently does not + # accept them + super().__init__(data, *args, name=name, **kwargs) + + # Update the attrs after initialization + self.attrs.update(attrs) + + @classmethod + def from_series(cls, series, sparse=None): + return cls(series) + + def assign_attrs(self, d): + self.attrs.update(d) + return self + + def assign_coords(self, **kwargs): + return pd.concat([self], keys=kwargs.values(), names=kwargs.keys()) + + @property + def coords(self): + """Read-only.""" + result = dict() + for name, levels in zip(self.index.names, self.index.levels): + result[name] = xr.Dataset(None, coords={name: levels})[name] + return result + + @property + def dims(self): + return tuple(self.index.names) + + def drop(self, label): + return self.droplevel(label) + + def rename(self, new_name_or_name_dict): + if isinstance(new_name_or_name_dict, dict): + return self.rename_axis(index=new_name_or_name_dict) + else: + return super().rename(new_name_or_name_dict) + + def 
sel(self, indexers=None, drop=False, **indexers_kwargs): + indexers = indexers or {} + indexers.update(indexers_kwargs) + if len(indexers) == 1: + level, key = list(indexers.items())[0] + if not isinstance(key, Collection) and not drop: + # When using .loc[] to select 1 label on 1 level, pandas drops + # the level. Use .xs() to avoid this behaviour unless drop=True + return AttrSeries(self.xs(key, level=level, drop_level=False)) + + idx = tuple(indexers.get(l, slice(None)) for l in self.index.names) + return AttrSeries(self.loc[idx]) + + def sum(self, *args, **kwargs): + obj = super(AttrSeries, self) + attrs = None + + try: + dim = kwargs.pop('dim') + except KeyError: + dim = list(args) + args = tuple() + + if isinstance(self.index, pd.MultiIndex): + if len(dim) == len(self.index.names): + # assume dimensions = full multi index, do simple sum + kwargs = {} + else: + # pivot and sum across columns + obj = self.unstack(dim) + kwargs['axis'] = 1 + attrs = self.attrs + else: + if dim != [self.index.name]: + raise ValueError(dim, self.index.name, self) + kwargs['level'] = dim + + return AttrSeries(obj.sum(*args, **kwargs), attrs=attrs) + + def squeeze(self, *args, **kwargs): + kwargs.pop('drop') + return super().squeeze(*args, **kwargs) if len(self) > 1 else self + + def as_xarray(self): + return xr.DataArray.from_series(self) + + def transpose(self, *dims): + return self.reorder_levels(dims) + + def to_dataframe(self): + return self.to_frame() + + def to_series(self): + return self + + def align_levels(self, other): + """Work around https://github.com/pandas-dev/pandas/issues/25760. + + Return *self* with common levels in the same order as in *other*. + + .. todo:: remove when Quantity is xr.DataArray, or the above issue is + closed.
+ """ + if not isinstance(self.index, pd.MultiIndex): + return self + common = [n for n in other.index.names if n in self.index.names] + unique = [n for n in self.index.names if n not in common] + return self.reorder_levels(common + unique) diff --git a/ixmp/reporting/computations.py b/ixmp/reporting/computations.py index 4f95b7f5a..88a394248 100644 --- a/ixmp/reporting/computations.py +++ b/ixmp/reporting/computations.py @@ -10,7 +10,7 @@ import pint import xarray as xr -from .quantity import AttrSeries, Quantity, as_quantity +from .quantity import Quantity from .utils import ( RENAME_DIMS, dims_for_qty, @@ -192,9 +192,10 @@ def data_for_quantity(ix_type, name, column, scenario, config): # log.debug(' '.join(map(str, info))) # Convert to a Quantity, assign attrbutes and name - qty = as_quantity(data[column]) \ - .assign_attrs(attrs) \ - .rename(name + ('-margin' if column == 'mrg' else '')) + qty = Quantity( + data[column], + name=name + ('-margin' if column == 'mrg' else ''), + attrs=attrs) try: # Remove length-1 dimensions for scalars @@ -259,7 +260,7 @@ def concat(*objs, **kwargs): Reporter. """ objs = filter_concat_args(objs) - if Quantity is AttrSeries: + if Quantity.CLASS == 'AttrSeries': kwargs.pop('dim') return pd.concat(objs, **kwargs) elif Quantity is xr.DataArray: # pragma: no cover @@ -281,24 +282,12 @@ def product(*quantities): # Initialize result values with first entry result, u_result = next(items) - def _align_levels(ref, obj): - """Work around https://github.com/pandas-dev/pandas/issues/25760 - - Return a copy of *obj* with common levels in the same order as *ref*. - - TODO remove when Quantity is xr.DataArray, or above issues is closed. 
- """ - if not isinstance(obj.index, pd.MultiIndex): - return obj - common = [n for n in ref.index.names if n in obj.index.names] - unique = [n for n in obj.index.names if n not in common] - return obj.reorder_levels(common + unique) - # Iterate over remaining entries for q, u in items: - if Quantity is AttrSeries: - result = (result * _align_levels(result, q)).dropna() - else: # pragma: no cover + if Quantity.CLASS == 'AttrSeries': + # Work around pandas-dev/pandas#25760; see attrseries.py + result = (result * q.align_levels(result)).dropna() + else: result = result * q u_result *= u @@ -321,7 +310,7 @@ def ratio(numerator, denominator): result = numerator / denominator result.attrs['_unit'] = u_num / u_denom - if Quantity is AttrSeries: + if Quantity.CLASS == 'AttrSeries': result.dropna(inplace=True) return result @@ -343,7 +332,7 @@ def select(qty, indexers, inverse=False): new_indexers = {} for dim, labels in indexers.items(): new_indexers[dim] = list(filter(lambda l: l not in labels, - qty.coords[dim])) + qty.coords[dim].data)) indexers = new_indexers return qty.sel(indexers) @@ -433,7 +422,7 @@ def load_file(path, dims={}, units=None): .rename(columns=dims) index_columns = list(dims.values()) - return as_quantity(data.set_index(index_columns)['value'], units=units) + return Quantity(data.set_index(index_columns)['value'], units=units) elif path.suffix in ('.xls', '.xlsx'): # TODO define expected Excel data input format raise NotImplementedError # pragma: no cover diff --git a/ixmp/reporting/quantity.py b/ixmp/reporting/quantity.py index 647348769..b18085fac 100644 --- a/ixmp/reporting/quantity.py +++ b/ixmp/reporting/quantity.py @@ -1,186 +1,40 @@ -from collections.abc import Collection - -import numpy import pandas as pd -import pandas.core.indexes.base as ibase import pint -import xarray as xr -class AttrSeries(pd.Series): - """:class:`pandas.Series` subclass imitating :class:`xarray.DataArray`. 
+class _QuantityFactory: + #: The current internal class used to represent reporting quantities. + #: :meth:`as_quantity` always converts to this type. + CLASS = 'AttrSeries' + # CLASS = 'SparseDataArray' - Future versions of :mod:`ixmp.reporting` will use :class:`xarray.DataArray` - as :class:`Quantity`; however, because :mod:`xarray` currently lacks sparse - matrix support, ixmp quantities may be too large for available memory. + def __call__(self, data, *args, **kwargs): + name = kwargs.pop('name', None) + units = kwargs.pop('units', None) + attrs = kwargs.pop('attrs', dict()) - The AttrSeries class provides similar methods and behaviour to - :class:`xarray.DataArray`, so that :mod:`ixmp.reporting.computations` - methods can use xarray-like syntax. + if self.CLASS == 'AttrSeries': + from .attrseries import AttrSeries as cls + elif self.CLASS == 'SparseDataArray': + from .sparsedataarray import SparseDataArray as cls - Parameters - ---------- - units : str or pint.Unit, optional - Set the units attribute. The value is converted to :class:`pint.Unit` - and added to `attrs`. - attrs : :class:`~collections.abc.Mapping`, optional - Set the :attr:`~pandas.Series.attrs` of the AttrSeries. This attribute - was added in `pandas 1.0 - `_, but is not - currently supported by the Series constructor. 
- """ + if isinstance(data, pd.Series): + result = cls.from_series(data) + elif self.CLASS == 'AttrSeries': + result = cls(data, *args, **kwargs) + else: + assert len(args) == len(kwargs) == 0, (args, kwargs) + result = data._sda.convert() - # See https://pandas.pydata.org/docs/development/extending.html - @property - def _constructor(self): - return AttrSeries + if name: + result.name = name - def __init__(self, data=None, *args, name=None, units=None, attrs=None, - **kwargs): - attrs = attrs or dict() if units: - # Insert the units into the attrs attrs['_unit'] = pint.Unit(units) - if isinstance(data, (AttrSeries, xr.DataArray)): - # Use attrs from an existing object - new_attrs = data.attrs.copy() - - # Overwrite with explicit attrs argument - new_attrs.update(attrs) - attrs = new_attrs - - # Pre-convert to pd.Series from xr.DataArray to preserve names and - # labels. For AttrSeries, this is a no-op (see below). - name = ibase.maybe_extract_name(name, data, type(self)) - data = data.to_series() - - # Don't pass attrs to pd.Series constructor; it currently does not - # accept them - super().__init__(data, *args, name=name, **kwargs) - - # Update the attrs after initialization - self.attrs.update(attrs) - - @classmethod - def from_series(cls, series, sparse=None): - return cls(series) - - def assign_attrs(self, d): - self.attrs.update(d) - return self - - def assign_coords(self, **kwargs): - return pd.concat([self], keys=kwargs.values(), names=kwargs.keys()) - - @property - def coords(self): - """Read-only.""" - return dict(zip(self.index.names, self.index.levels)) - - @property - def dims(self): - return tuple(self.index.names) - - def drop(self, label): - return self.droplevel(label) - - def rename(self, new_name_or_name_dict): - if isinstance(new_name_or_name_dict, dict): - return self.rename_axis(index=new_name_or_name_dict) - else: - return super().rename(new_name_or_name_dict) - - def sel(self, indexers=None, drop=False, **indexers_kwargs): - indexers = 
indexers or {} - indexers.update(indexers_kwargs) - if len(indexers) == 1: - level, key = list(indexers.items())[0] - if not isinstance(key, Collection) and not drop: - # When using .loc[] to select 1 label on 1 level, pandas drops - # the level. Use .xs() to avoid this behaviour unless drop=True - return AttrSeries(self.xs(key, level=level, drop_level=False)) - - idx = tuple(indexers.get(n, slice(None)) for n in self.index.names) - return AttrSeries(self.loc[idx]) - - def sum(self, *args, **kwargs): - try: - dim = kwargs.pop('dim') - if isinstance(self.index, pd.MultiIndex): - if len(dim) == len(self.index.names): - # assume dimensions = full multi index, do simple sum - obj = self - kwargs = {} - else: - # pivot and sum across columns - obj = self.unstack(dim) - kwargs['axis'] = 1 - else: - if dim != [self.index.name]: - raise ValueError(dim, self.index.name, self) - obj = super() - kwargs['level'] = dim - except KeyError: - obj = super() - return AttrSeries(obj.sum(*args, **kwargs)) - - def squeeze(self, *args, **kwargs): - kwargs.pop('drop') - return super().squeeze(*args, **kwargs) if len(self) > 1 else self - - def as_xarray(self): - return xr.DataArray.from_series(self) - - def transpose(self, *dims): - return self.reorder_levels(dims) - - def to_dataframe(self): - return self.to_frame() - - def to_series(self): - return self - - -#: The current internal class used to represent reporting quantities. -#: :meth:`as_quantity` always converts to this type. -Quantity = AttrSeries -# See also: -# - test_report_size() for a test that shows how non-sparse xr.DataArray -# triggers MemoryError. 
-# Quantity = xr.DataArray - - -def as_sparse_xarray(obj, units=None): # pragma: no cover - """Convert *obj* to :class:`xarray.DataArray` with sparse.COO storage.""" - import sparse - from xarray.core.dtypes import maybe_promote - - if isinstance(obj, xr.DataArray) and isinstance(obj.data, numpy.ndarray): - result = xr.DataArray( - data=sparse.COO.from_numpy( - obj.data, - fill_value=maybe_promote(obj.data.dtype)[1]), - coords=obj.coords, - dims=obj.dims, - name=obj.name, - attrs=obj.attrs, - ) - elif isinstance(obj, pd.Series): - result = xr.DataArray.from_series(obj, sparse=True) - else: - result = obj - - if units: - result.attrs['_unit'] = pint.Unit(units) + result.attrs.update(attrs) - return result + return result -#: Convert args to :class:`.Quantity` class. -#: -#: Returns -#: ------- -#: .Quantity -#: `obj` converted to the current Quantity type. -as_quantity = AttrSeries if Quantity is AttrSeries else as_sparse_xarray +Quantity = _QuantityFactory() diff --git a/ixmp/testing.py b/ixmp/testing.py index 4ccc8485d..c34736c03 100644 --- a/ixmp/testing.py +++ b/ixmp/testing.py @@ -59,6 +59,7 @@ from . import cli, config as ixmp_config from .core import Platform, TimeSeries, Scenario, IAMC_IDX +from .reporting import Quantity log = logging.getLogger(__name__) @@ -480,51 +481,43 @@ def test_foo(caplog): pytest.fail('\n'.join(lines)) -def assert_qty_equal(a, b, check_attrs=True, **kwargs): +def assert_qty_equal(a, b, check_type=True, check_attrs=True, **kwargs): """Assert that Quantity objects *a* and *b* are equal. When Quantity is AttrSeries, *a* and *b* are first passed through :meth:`as_quantity`. 
""" - from xarray import DataArray - from xarray.testing import assert_equal as assert_xr_equal - - from .reporting.quantity import AttrSeries, Quantity, as_quantity - - if Quantity is AttrSeries: - # Convert pd.Series automatically - a = as_quantity(a) if isinstance(a, (pd.Series, DataArray)) else a - b = as_quantity(b) if isinstance(b, (pd.Series, DataArray)) else b + if not check_type: + a = Quantity(a) + b = Quantity(b) + if Quantity.CLASS == 'AttrSeries': assert_series_equal(a, b, check_dtype=False, **kwargs) - elif Quantity is DataArray: # pragma: no cover - assert_xr_equal(a, b, **kwargs) + else: + import xarray.testing + xarray.testing.assert_equal(a, b, **kwargs) # check attributes are equal if check_attrs: assert a.attrs == b.attrs -def assert_qty_allclose(a, b, check_attrs=True, **kwargs): +def assert_qty_allclose(a, b, check_type=True, check_attrs=True, **kwargs): """Assert that Quantity objects *a* and *b* have numerically close values. When Quantity is AttrSeries, *a* and *b* are first passed through :meth:`as_quantity`. 
""" - from xarray import DataArray - from xarray.testing import assert_allclose as assert_xr_allclose - - from .reporting.quantity import AttrSeries, Quantity, as_quantity - - if Quantity is AttrSeries: - # Convert pd.Series automatically - a = as_quantity(a) if isinstance(a, (pd.Series, DataArray)) else a - b = as_quantity(b) if isinstance(b, (pd.Series, DataArray)) else b + if not check_type: + a = Quantity(a) + b = Quantity(b) + if Quantity.CLASS == 'AttrSeries': assert_series_equal(a, b, **kwargs) - elif Quantity is DataArray: # pragma: no cover + else: + import xarray.testing kwargs.pop('check_dtype', None) - assert_xr_allclose(a, b, **kwargs) + xarray.testing.assert_allclose(a._sda.dense, b._sda.dense, **kwargs) # check attributes are equal if check_attrs: diff --git a/ixmp/tests/reporting/__init__.py b/ixmp/tests/reporting/__init__.py index 7c92b1170..83530a0f7 100644 --- a/ixmp/tests/reporting/__init__.py +++ b/ixmp/tests/reporting/__init__.py @@ -23,6 +23,7 @@ def add_test_data(scen): x = xr.DataArray(np.random.rand(len(t), len(y)), coords=[t, y], dims=['t', 'y'], attrs={'_unit': ureg.Unit('kg')}) + x = Quantity(x) # As a pd.DataFrame with units x_df = x.to_series().rename('value').reset_index() diff --git a/ixmp/tests/reporting/test_computations.py b/ixmp/tests/reporting/test_computations.py index 46608c0f3..63291ce2c 100644 --- a/ixmp/tests/reporting/test_computations.py +++ b/ixmp/tests/reporting/test_computations.py @@ -5,7 +5,7 @@ import pytest import ixmp -from ixmp.reporting import Reporter, as_quantity, computations +from ixmp.reporting import Reporter, Quantity, computations from ixmp.testing import assert_logs from . 
import add_test_data @@ -53,19 +53,19 @@ def test_select(data): # Unpack *_, t_foo, t_bar, x = data - x = as_quantity(x) - assert len(x) == 6 * 6 + x = Quantity(x) + assert x.size == 6 * 6 # Selection with inverse=False indexers = {'t': t_foo[0:1] + t_bar[0:1]} result_0 = computations.select(x, indexers=indexers) - assert len(result_0) == 2 * 6 + assert result_0.size == 2 * 6 # Single indexer along one dimension results in 1D data indexers['y'] = '2010' result_1 = computations.select(x, indexers=indexers) - assert len(result_1) == 2 * 1 + assert result_1.size == 2 * 1 # Selection with inverse=True result_2 = computations.select(x, indexers=indexers, inverse=True) - assert len(result_2) == 4 * 5 + assert result_2.size == 4 * 5 diff --git a/ixmp/tests/reporting/test_reporting.py b/ixmp/tests/reporting/test_reporting.py index 6e9b3550b..1a54cdb9a 100644 --- a/ixmp/tests/reporting/test_reporting.py +++ b/ixmp/tests/reporting/test_reporting.py @@ -20,7 +20,7 @@ configure, computations, ) -from ixmp.reporting.quantity import AttrSeries, Quantity, as_quantity +from ixmp.reporting import Quantity from ixmp.testing import ( make_dantzig, assert_logs, @@ -173,7 +173,7 @@ def test_reporter_add_product(test_mp, ureg): assert key == 'x squared:t-y' # Product has the expected value - exp = as_quantity(x * x, name='x') + exp = Quantity(x * x, name='x') exp.attrs['_unit'] = ureg('kilogram ** 2').units assert_qty_equal(exp, rep.get(key)) @@ -203,7 +203,7 @@ def test_reporter_from_dantzig(test_mp, ureg): # Summation across all dimensions results a 1-element Quantity d = rep.get('d:') - assert d.shape == ((1,) if Quantity is AttrSeries else tuple()) + assert d.shape == ((1,) if Quantity.CLASS == 'AttrSeries' else tuple()) assert d.size == 1 assert np.isclose(d.values, 11.7) @@ -231,7 +231,7 @@ def test_reporter_from_dantzig(test_mp, ureg): # Disaggregation with explicit data # (cases of canned food 'p'acked in oil or water) shares = xr.DataArray([0.8, 0.2], coords=[['oil', 'water']], 
dims=['p']) - new_key = rep.disaggregate('b:j', 'p', args=[as_quantity(shares)]) + new_key = rep.disaggregate('b:j', 'p', args=[Quantity(shares)]) # ...produces the expected key with new dimension added assert new_key == 'b:j-p' @@ -377,7 +377,7 @@ def test_reporter_file(tmp_path): def test_file_formats(test_data_path, tmp_path): r = Reporter() - expected = as_quantity( + expected = Quantity( pd.read_csv(test_data_path / 'report-input0.csv', index_col=['i', 'j'])['value'], units='km') @@ -443,10 +443,10 @@ def test_units(ureg): # Create some dummy data dims = dict(coords=['a b c'.split()], dims=['x']) r.add('energy:x', - as_quantity(xr.DataArray([1., 3, 8], **dims), units='MJ')) + Quantity(xr.DataArray([1., 3, 8], **dims), units='MJ')) r.add('time', - as_quantity(xr.DataArray([5., 6, 8], **dims), units='hour')) - r.add('efficiency', as_quantity(xr.DataArray([0.9, 0.8, 0.95], **dims))) + Quantity(xr.DataArray([5., 6, 8], **dims), units='hour')) + r.add('efficiency', Quantity(xr.DataArray([0.9, 0.8, 0.95], **dims))) # Aggregation preserves units r.add('energy', (computations.sum, 'energy:x', None, ['x'])) @@ -701,7 +701,7 @@ def test_aggregate(test_mp): t_groups = {'foo': t_foo, 'bar': t_bar, 'baz': ['foo1', 'bar5', 'bar6']} # Use the computation directly - agg1 = computations.aggregate(as_quantity(x), {'t': t_groups}, True) + agg1 = computations.aggregate(Quantity(x), {'t': t_groups}, True) # Expected set of keys along the aggregated dimension assert set(agg1.coords['t'].values) == set(t) | set(t_groups.keys()) From 4b7248949eee20eef1945a6c5391c1d41877c482 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sat, 25 Apr 2020 21:36:01 +0200 Subject: [PATCH 03/22] Add reporting.sparsedataarray --- ixmp/reporting/sparsedataarray.py | 129 ++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 ixmp/reporting/sparsedataarray.py diff --git a/ixmp/reporting/sparsedataarray.py b/ixmp/reporting/sparsedataarray.py new file mode 100644 index 
000000000..d090ac010 --- /dev/null +++ b/ixmp/reporting/sparsedataarray.py @@ -0,0 +1,129 @@ +from warnings import filterwarnings + +import numpy as np +import pandas as pd +import xarray as xr +from xarray.core.utils import either_dict_or_kwargs + +# sparse 0.9.1, numba 0.49.0 +filterwarnings( + action='ignore', + message="An import was requested from a module that has moved location.", + module='sparse._coo.numba_extension', + ) + +import sparse # noqa: E402 + + +@xr.register_dataarray_accessor('_sda') +class SparseAccessor: + """:mod:`xarray` accessor to help :class:`SparseDataArray`.""" + def __init__(self, obj): + if not isinstance(obj, xr.DataArray): + raise TypeError('._sda accessor only valid for xr.DataArray') + self.da = obj + + def convert(self): + """Return a :class:`SparseDataArray` instance.""" + if not self.da._sda.COO_data: + # Dense (numpy.ndarray) data; convert to sparse + data = sparse.COO.from_numpy(self.da.data, fill_value=None) + elif not np.isnan(self.da.data.fill_value): + # sparse.COO with non-NaN fill value; copy and change + data = self.da.data.copy(deep=False) + data.fill_value = data.dtype.type(np.nan) + else: + # No change + data = self.da.data + + if isinstance(self.da, SparseDataArray): + # Replace the variable, returning a copy + variable = self.da.variable._replace(data=data) + return self.da._replace(variable=variable) + else: + # Construct + return SparseDataArray( + data=data, + coords=self.da.coords, + dims=self.da.dims, + name=self.da.name, + attrs=self.da.attrs, + ) + + @property + def COO_data(self): + """:obj:`True` if the DataArray has :class:`sparse.COO` data.""" + return isinstance(self.da.data, sparse.COO) + + @property + def dense(self): + """Return a copy with dense (:class:`.ndarray`) data.""" + if self.COO_data: + # Use existing method xr.Variable._to_dense() + return self.da._replace(variable=self.da.variable._to_dense()) + else: + return self.da + + @property + def dense_super(self): + """Return a proxy to a 
:class:`.ndarray`-backed :class:`.DataArray`.""" + return super(SparseDataArray, self.dense) + + +class SparseDataArray(xr.DataArray): + """:class:`xr.DataArray` with sparse data. + + SparseDataArray uses :class:`sparse.COO` for storage with :data:`numpy.nan` + as its :attr:`sparse.COO.fill_value`. Some methods of :class:`.DataArray` + are overridden to ensure data is in sparse, or dense, format as necessary, + to provide expected functionality not currently supported by :mod:`sparse`, + and to avoid exhausting memory for some operations that require dense data. + + See Also + -------- + SparseAccessor + """ + __slots__ = tuple() + + @classmethod + def from_series(cls, obj, sparse=True): + # Call the parent method always with sparse=True, then re-wrap + return xr.DataArray.from_series(obj, sparse=True)._sda.convert() + + def equals(self, other): + """Necessary for :meth:`xarray.testing.assert_equal` to work.""" + return self.variable.equals(other.variable, equiv=np.equal) + + @property + def loc(self): + # FIXME doesn't allow assignment + return self._sda.dense_super.loc + + def sel(self, indexers=None, method=None, tolerance=None, drop=False, + **indexers_kwargs) -> 'SparseDataArray': + """Handle >1-D indexers with sparse data.""" + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, 'sel') + if isinstance(indexers, dict) and len(indexers) > 1: + result = self + for k, v in indexers.items(): + result = result.sel({k: v}, method=method, tolerance=tolerance, + drop=drop) + return result + else: + return super().sel(indexers=indexers, method=method, + tolerance=tolerance, drop=drop) + + def to_dataframe(self): + # FIXME this does not exactly match the behaviour of xr.DataArray; it omits + # coordinate variable + return self.to_series().to_frame() + + def to_series(self) -> pd.Series: + # Use SparseArray.coords and .data (each already 1-D) to construct a + # pd.Series without first converting to a potentially very large + # ndarray + + # Construct a pd.MultiIndex
without using .from_product + index = pd.MultiIndex.from_arrays(self.data.coords, names=self.dims) \ + .set_levels([self.coords[d].values for d in self.dims]) + return pd.Series(self.data.data, index=index, name=self.name) From e7fe571e35bf9f5a1722a51116fd462f83ecf649 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sat, 25 Apr 2020 21:38:09 +0200 Subject: [PATCH 04/22] Use SparseDataArray in tests --- ixmp/reporting/computations.py | 23 +++++++------ ixmp/tests/reporting/test_quantity.py | 48 +++++++++++++++------------ 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/ixmp/reporting/computations.py b/ixmp/reporting/computations.py index 88a394248..de6bd158f 100644 --- a/ixmp/reporting/computations.py +++ b/ixmp/reporting/computations.py @@ -226,10 +226,6 @@ def aggregate(quantity, groups, keep): Same dimensionality as `quantity`. """ - # NB .transpose() below is necessary only for Quantity == AttrSeries. It - # can be removed when Quantity = xr.DataArray. - dim_order = quantity.dims - attrs = quantity.attrs.copy() for dim, dim_groups in groups.items(): @@ -238,10 +234,16 @@ def aggregate(quantity, groups, keep): # Aggregate each group for group, members in dim_groups.items(): - values.append(quantity.sel({dim: members}) - .sum(dim=dim) - .assign_coords(**{dim: group}) - .transpose(*dim_order)) + agg = quantity.sel({dim: members}) \ + .sum(dim=dim) \ + .assign_coords(**{dim: group}) + if Quantity.CLASS == 'AttrSeries': + # .transpose() is necessary for AttrSeries + agg = agg.transpose(*quantity.dims) + else: + # Restore fill_value=NaN for compatibility + agg = agg._sda.convert() + values.append(agg) # Reassemble to a single dataarray quantity = concat(*values, dim=dim) @@ -263,8 +265,9 @@ def concat(*objs, **kwargs): if Quantity.CLASS == 'AttrSeries': kwargs.pop('dim') return pd.concat(objs, **kwargs) - elif Quantity is xr.DataArray: # pragma: no cover - return xr.concat(objs, **kwargs) + else: + # Correct fill-values + return
xr.concat(objs, **kwargs)._sda.convert() def disaggregate_shares(quantity, shares): diff --git a/ixmp/tests/reporting/test_quantity.py b/ixmp/tests/reporting/test_quantity.py index 867290825..bfb13e50b 100644 --- a/ixmp/tests/reporting/test_quantity.py +++ b/ixmp/tests/reporting/test_quantity.py @@ -29,17 +29,17 @@ def test_assert(self, a): # Convert to pd.Series b = a.to_series() - assert_qty_equal(a, b) - assert_qty_equal(b, a) - assert_qty_allclose(a, b) - assert_qty_allclose(b, a) + assert_qty_equal(a, b, check_type=False) + assert_qty_equal(b, a, check_type=False) + assert_qty_allclose(a, b, check_type=False) + assert_qty_allclose(b, a, check_type=False) c = Quantity(a) - assert_qty_equal(a, c) - assert_qty_equal(c, a) - assert_qty_allclose(a, c) - assert_qty_allclose(c, a) + assert_qty_equal(a, c, check_type=True) + assert_qty_equal(c, a, check_type=True) + assert_qty_allclose(a, c, check_type=True) + assert_qty_allclose(c, a, check_type=True) def test_assert_with_attrs(self, a): """Test assertions about Quantity with attrs. 
@@ -88,8 +88,7 @@ def test_others(self, foo): assert foo.drop('a').dims == ('b',) -@pytest.mark.skip(reason="Pending #317") -def test_as_sparse_xarray(): +def test_sda_accessor(): """Test conversion to sparse.COO-backed xr.DataArray.""" x_series = pd.Series( data=[1., 2, 3, 4], @@ -98,25 +97,32 @@ def test_as_sparse_xarray(): ) y_series = pd.Series(data=[5., 6], index=pd.Index(['e', 'f'], name='baz')) - x = xr.DataArray.from_series(x_series, sparse=True) - y = xr.DataArray.from_series(y_series, sparse=True) + x = SparseDataArray.from_series(x_series) + y = SparseDataArray.from_series(y_series) - x_dense = xr.DataArray.from_series(x_series) - y_dense = xr.DataArray.from_series(y_series) + x_dense = x._sda.dense_super + y_dense = y._sda.dense_super + assert not x_dense._sda.COO_data or x_dense._sda.nan_fill + assert not y_dense._sda.COO_data or y_dense._sda.nan_fill with pytest.raises(ValueError, match='make sure that the broadcast shape'): x_dense * y - z1 = as_sparse_xarray(x_dense) * y - z2 = x * as_sparse_xarray(y_dense) - assert z1.dims == ('foo', 'bar', 'baz') + z1 = x_dense._sda.convert() * y + + z2 = x * y_dense._sda.convert() + assert z1.dims == ('foo', 'bar', 'baz') == z2.dims assert_xr_equal(z1, z2) - z3 = as_sparse_xarray(x) * as_sparse_xarray(y) + z3 = x._sda.convert() * y._sda.convert() assert_xr_equal(z1, z3) - z4 = as_sparse_xarray(x) * y + z4 = x._sda.convert() * y assert_xr_equal(z1, z4) - z5 = as_sparse_xarray(x_series) * y - assert_xr_equal(z1, z5) + # Doesn't work: can't align automatically + with pytest.raises(ValueError, match='Please make sure that the broadcast ' + 'shape of just the sparse arrays is the same as the ' + 'broadcast shape of all the operands.'): + z5 = SparseDataArray(x_series) * y + assert_xr_equal(z1, z5) From 0e3a2fee40653a80cbe156844a459034878529c7 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sat, 25 Apr 2020 21:45:48 +0200 Subject: [PATCH 05/22] Parametrize tests for both kinds of reporting.Quantity --- 
ixmp/testing.py | 11 +++++++++++ ixmp/tests/reporting/__init__.py | 3 +++ ixmp/tests/reporting/test_computations.py | 3 +++ ixmp/tests/reporting/test_quantity.py | 19 +++++++++---------- ixmp/tests/reporting/test_reporting.py | 2 ++ 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/ixmp/testing.py b/ixmp/testing.py index c34736c03..f15b04f27 100644 --- a/ixmp/testing.py +++ b/ixmp/testing.py @@ -98,6 +98,17 @@ def invoke(self, *args, **kwargs): yield Runner() +@pytest.fixture(params=['AttrSeries', 'SparseDataArray']) +def parametrize_quantity_class(request): + """Fixture to run tests twice, for both reporting Quantity classes.""" + pre = Quantity.CLASS + + Quantity.CLASS = request.param + yield + + Quantity.CLASS = pre + + @pytest.fixture def protect_pint_app_registry(): """Protect pint's application registry. diff --git a/ixmp/tests/reporting/__init__.py b/ixmp/tests/reporting/__init__.py index 83530a0f7..cae155e76 100644 --- a/ixmp/tests/reporting/__init__.py +++ b/ixmp/tests/reporting/__init__.py @@ -2,6 +2,9 @@ import pint import xarray as xr +from ixmp.reporting import Quantity + + REGISTRY = pint.get_application_registry() diff --git a/ixmp/tests/reporting/test_computations.py b/ixmp/tests/reporting/test_computations.py index 63291ce2c..8042272f3 100644 --- a/ixmp/tests/reporting/test_computations.py +++ b/ixmp/tests/reporting/test_computations.py @@ -11,6 +11,9 @@ from . 
import add_test_data +pytestmark = pytest.mark.usefixtures('parametrize_quantity_class') + + @pytest.fixture(scope='function') def data(test_mp, request): scen = ixmp.Scenario(test_mp, request.node.name, request.node.name, 'new') diff --git a/ixmp/tests/reporting/test_quantity.py b/ixmp/tests/reporting/test_quantity.py index bfb13e50b..cf8d93174 100644 --- a/ixmp/tests/reporting/test_quantity.py +++ b/ixmp/tests/reporting/test_quantity.py @@ -4,21 +4,20 @@ import xarray as xr from xarray.testing import assert_equal as assert_xr_equal -from ixmp.reporting.quantity import AttrSeries, Quantity, as_sparse_xarray +from ixmp import Reporter, Scenario +from ixmp.reporting import Quantity, computations +from ixmp.reporting.attrseries import AttrSeries +from ixmp.reporting.sparsedataarray import SparseDataArray from ixmp.testing import assert_qty_allclose, assert_qty_equal +@pytest.mark.usefixtures('parametrize_quantity_class') class TestQuantity: - """Tests of Quantity. - - NB. these tests should pass whether Quantity is set to AttrSeries or - xr.DataArray in ixmp.reporting.utils. As written, they only test the - current form of Quantity. @gidden tested both by hand-swapping the Quantity - class and running tests as of commit df1ec6f of PR #147. - """ - @pytest.fixture() + """Tests of Quantity.""" + @pytest.fixture def a(self): - yield xr.DataArray([0.8, 0.2], coords=[['oil', 'water']], dims=['p']) + da = xr.DataArray([0.8, 0.2], coords=[['oil', 'water']], dims=['p']) + yield Quantity(da) def test_assert(self, a): """Test assertions about Quantity. diff --git a/ixmp/tests/reporting/test_reporting.py b/ixmp/tests/reporting/test_reporting.py index 1a54cdb9a..ca877a4e7 100644 --- a/ixmp/tests/reporting/test_reporting.py +++ b/ixmp/tests/reporting/test_reporting.py @@ -31,6 +31,8 @@ from . 
import add_test_data +pytestmark = pytest.mark.usefixtures('parametrize_quantity_class') + test_args = ('Douglas Adams', 'Hitchhiker') TS_DF = {'year': [2010, 2020], 'value': [23.7, 23.8]} From c8252f86a9724c8d0e746ccd4f7b99fb19bcff20 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sat, 25 Apr 2020 21:46:58 +0200 Subject: [PATCH 06/22] Move test_report_size to TestQuantity.test_size --- ixmp/tests/reporting/test_quantity.py | 66 ++++++++++++++++++++++++++ ixmp/tests/reporting/test_reporting.py | 60 ----------------------- 2 files changed, 66 insertions(+), 60 deletions(-) diff --git a/ixmp/tests/reporting/test_quantity.py b/ixmp/tests/reporting/test_quantity.py index cf8d93174..df155be30 100644 --- a/ixmp/tests/reporting/test_quantity.py +++ b/ixmp/tests/reporting/test_quantity.py @@ -1,4 +1,5 @@ """Tests for ixmp.reporting.quantity.""" +import numpy as np import pandas as pd import pytest import xarray as xr @@ -19,6 +20,51 @@ def a(self): da = xr.DataArray([0.8, 0.2], coords=[['oil', 'water']], dims=['p']) yield Quantity(da) + @pytest.fixture(scope='class') + def scen_with_big_data(self, test_mp, num_params=10): + from itertools import zip_longest + + # test_mp.add_unit('kg') + scen = Scenario(test_mp, 'TestQuantity', 'big data', version='new') + + # Dimensions and their lengths (Fibonacci numbers) + N_dims = 6 + dims = 'abcdefgh'[:N_dims + 1] + sizes = [1, 5, 21, 21, 89, 377, 1597, 6765][:N_dims + 1] + + # commented: "377 / 73984365 elements = 0.00051% full" + # from functools import reduce + # from operator import mul + # size = reduce(mul, sizes) + # print('{} / {} elements = {:.5f}% full' + # .format(max(sizes), size, 100 * max(sizes) / size)) + + # Names like f_0000 ... 
f_1596 along each dimension + coords = [] + for d, N in zip(dims, sizes): + coords.append([f'{d}_{i:04d}' for i in range(N)]) + # Add to Scenario + scen.init_set(d) + scen.add_set(d, coords[-1]) + + def _make_values(): + """Make a DataFrame containing each label in *coords* ≥ 1 time.""" + values = list(zip_longest(*coords, np.random.rand(max(sizes)))) + result = pd.DataFrame(values, columns=list(dims) + ['value']) \ + .ffill() + result['unit'] = 'kg' + return result + + # Fill the Scenario with quantities named q_01 ... q_09 + names = [] + for i in range(num_params): + name = f'q_{i:02d}' + scen.init_par(name, list(dims)) + scen.add_par(name, _make_values()) + names.append(name) + + yield scen + def test_assert(self, a): """Test assertions about Quantity. @@ -64,6 +110,26 @@ def test_assert_with_attrs(self, a): a.attrs = {'bar': 'foo'} assert_qty_equal(a, b, check_attrs=False) + def test_size(self, scen_with_big_data): + """Stress-test reporting of large, sparse quantities.""" + scen = scen_with_big_data + + # Create the reporter + rep = Reporter.from_scenario(scen) + + # Add a task to compute the product, i.e. 
requires all the q_* + keys = [rep.full_key(name) for name in scen.par_list()] + rep.add('bigmem', tuple([computations.product] + keys)) + + # One quantity fits in memory + rep.get(keys[0]) + + # All quantities can be multiplied without raising MemoryError + result = rep.get('bigmem') + + # Result can be converted to pd.Series + result.to_series() + class TestAttrSeries: """Tests of AttrSeries in particular.""" diff --git a/ixmp/tests/reporting/test_reporting.py b/ixmp/tests/reporting/test_reporting.py index ca877a4e7..d375d08d8 100644 --- a/ixmp/tests/reporting/test_reporting.py +++ b/ixmp/tests/reporting/test_reporting.py @@ -632,66 +632,6 @@ def test_cli(ixmp_cli, test_mp, test_data_path): """) -def test_report_size(test_mp): - """Stress-test reporting of large, sparse quantities.""" - from itertools import zip_longest - - import numpy as np - - # test_mp.add_unit('kg') - scen = ixmp.Scenario(test_mp, 'size test', 'base', version='new') - - # Dimensions and their lengths (Fibonacci numbers) - N_dims = 6 - dims = 'abcdefgh'[:N_dims + 1] - sizes = [1, 5, 21, 21, 89, 377, 1597, 6765][:N_dims + 1] - - # commented: "377 / 73984365 elements = 0.00051% full" - # from functools import reduce - # from operator import mul - # size = reduce(mul, sizes) - # print('{} / {} elements = {:.5f}% full' - # .format(max(sizes), size, 100 * max(sizes) / size)) - - # Names like f_0000 ... f_1596 along each dimension - coords = [] - for d, N in zip(dims, sizes): - coords.append([f'{d}_{i:04d}' for i in range(N)]) - # Add to Scenario - scen.init_set(d) - scen.add_set(d, coords[-1]) - - def _make_values(): - """Make a DataFrame containing each label in *coords* at least once.""" - values = list(zip_longest(*coords, np.random.rand(max(sizes)))) - result = pd.DataFrame(values, columns=list(dims) + ['value']) \ - .ffill() - result['unit'] = 'kg' - return result - - # Fill the Scenario with quantities named q_01 ... 
q_09 - N = 10 - names = [] - for i in range(10): - name = f'q_{i:02d}' - scen.init_par(name, list(dims)) - scen.add_par(name, _make_values()) - names.append(name) - - # Create the reporter - rep = Reporter.from_scenario(scen) - - # Add an operation that takes the product, i.e. requires all the q_* - keys = [rep.full_key(name) for name in names] - rep.add('bigmem', tuple([computations.product] + keys)) - - # One quantity fits in memory - rep.get(keys[0]) - - # All quantities together trigger MemoryError - rep.get('bigmem') - - def test_aggregate(test_mp): scen = ixmp.Scenario(test_mp, 'Group reporting', 'group reporting', 'new') t, t_foo, t_bar, x = add_test_data(scen) From 21acfed05a229062ca7bc4dfb355243f2f05888b Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sun, 26 Apr 2020 12:21:58 +0200 Subject: [PATCH 07/22] Appease Stickler --- ixmp/reporting/sparsedataarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ixmp/reporting/sparsedataarray.py b/ixmp/reporting/sparsedataarray.py index d090ac010..ed4309c5d 100644 --- a/ixmp/reporting/sparsedataarray.py +++ b/ixmp/reporting/sparsedataarray.py @@ -10,7 +10,7 @@ action='ignore', message="An import was requested from a module that has moved location.", module='sparse._coo.numba_extension', - ) +) import sparse # noqa: E402 @@ -48,7 +48,7 @@ def convert(self): dims=self.da.dims, name=self.da.name, attrs=self.da.attrs, - ) + ) @property def COO_data(self): From c3723f479ec6f81de88ebc8f5fee28a040fbe123 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sun, 26 Apr 2020 21:11:43 +0200 Subject: [PATCH 08/22] Improve coverage to 100% in reporting.attrseries and .sparsedataframe --- ixmp/reporting/attrseries.py | 67 ++++++++++++++------------ ixmp/reporting/computations.py | 11 ++++- ixmp/reporting/sparsedataarray.py | 21 ++------ ixmp/tests/reporting/test_quantity.py | 37 +++++++++++--- ixmp/tests/reporting/test_reporting.py | 2 +- 5 files changed, 82 insertions(+), 56 
deletions(-) diff --git a/ixmp/reporting/attrseries.py b/ixmp/reporting/attrseries.py index 4cbed5f0d..3540ea261 100644 --- a/ixmp/reporting/attrseries.py +++ b/ixmp/reporting/attrseries.py @@ -1,8 +1,5 @@ -from collections.abc import Collection - import pandas as pd import pandas.core.indexes.base as ibase -import pint import xarray as xr @@ -34,14 +31,10 @@ class AttrSeries(pd.Series): def _constructor(self): return AttrSeries - def __init__(self, data=None, *args, name=None, units=None, attrs=None, - **kwargs): + def __init__(self, data=None, *args, name=None, attrs=None, **kwargs): attrs = attrs or dict() - if units: - # Insert the units into the attrs - attrs['_unit'] = pint.Unit(units) - if isinstance(data, (AttrSeries, xr.DataArray)): + if hasattr(data, 'attrs'): # Use attrs from an existing object new_attrs = data.attrs.copy() @@ -49,9 +42,12 @@ def __init__(self, data=None, *args, name=None, units=None, attrs=None, new_attrs.update(attrs) attrs = new_attrs + if isinstance(data, (AttrSeries, xr.DataArray)): + # Extract name from existing object or use the argument + name = ibase.maybe_extract_name(name, data, type(self)) + # Pre-convert to pd.Series from xr.DataArray to preserve names and # labels. For AttrSeries, this is a no-op (see below). 
- name = ibase.maybe_extract_name(name, data, type(self)) data = data.to_series() # Don't pass attrs to pd.Series constructor; it currently does not @@ -65,10 +61,6 @@ def __init__(self, data=None, *args, name=None, units=None, attrs=None, def from_series(cls, series, sparse=None): return cls(series) - def assign_attrs(self, d): - self.attrs.update(d) - return self - def assign_coords(self, **kwargs): return pd.concat([self], keys=kwargs.values(), names=kwargs.keys()) @@ -87,6 +79,13 @@ def dims(self): def drop(self, label): return self.droplevel(label) + def item(self, *args): + if len(args) and args != (None,): + raise NotImplementedError + elif self.size != 1: + raise ValueError + return self.iloc[0] + def rename(self, new_name_or_name_dict): if isinstance(new_name_or_name_dict, dict): return self.rename_axis(index=new_name_or_name_dict) @@ -98,10 +97,17 @@ def sel(self, indexers=None, drop=False, **indexers_kwargs): indexers.update(indexers_kwargs) if len(indexers) == 1: level, key = list(indexers.items())[0] - if not isinstance(key, Collection) and not drop: - # When using .loc[] to select 1 label on 1 level, pandas drops - # the level. Use .xs() to avoid this behaviour unless drop=True - return AttrSeries(self.xs(key, level=level, drop_level=False)) + if isinstance(key, str) and not drop: + if isinstance(self.index, pd.MultiIndex): + # When using .loc[] to select 1 label on 1 level, pandas + # drops the level. 
Use .xs() to avoid this behaviour unless + # drop=True + return AttrSeries(self.xs(key, level=level, + drop_level=False)) + else: + # No MultiIndex; use .loc with a slice to avoid returning + # scalar + return self.loc[slice(key, key)] idx = tuple(indexers.get(l, slice(None)) for l in self.index.names) return AttrSeries(self.loc[idx]) @@ -116,19 +122,20 @@ def sum(self, *args, **kwargs): dim = list(args) args = tuple() - if isinstance(self.index, pd.MultiIndex): - if len(dim) == len(self.index.names): - # assume dimensions = full multi index, do simple sum - kwargs = {} - else: - # pivot and sum across columns - obj = self.unstack(dim) - kwargs['axis'] = 1 - attrs = self.attrs + if len(dim) == len(self.index.names): + bad_dims = set(dim) - set(self.index.names) + if bad_dims: + raise ValueError(f'{bad_dims} not found in array dimensions ' + f'{self.index.names}') + # Simple sum + kwargs = {} else: - if dim != [self.index.name]: - raise ValueError(dim, self.index.name, self) - kwargs['level'] = dim + # pivot and sum across columns + obj = self.unstack(dim) + kwargs['axis'] = 1 + # Result will be DataFrame; re-attach attrs when converted to + # AttrSeries + attrs = self.attrs return AttrSeries(obj.sum(*args, **kwargs), attrs=attrs) diff --git a/ixmp/reporting/computations.py b/ixmp/reporting/computations.py index de6bd158f..77684c591 100644 --- a/ixmp/reporting/computations.py +++ b/ixmp/reporting/computations.py @@ -5,10 +5,10 @@ from collections.abc import Mapping import logging from pathlib import Path +from warnings import filterwarnings import pandas as pd import pint -import xarray as xr from .quantity import Quantity from .utils import ( @@ -35,6 +35,15 @@ ] +# sparse 0.9.1, numba 0.49.0, triggered by xarray import +for msg in ["No direct replacement for 'numba.targets' available", + "An import was requested from a module that has moved location."]: + filterwarnings(action='ignore', message=msg, + module='sparse._coo.numba_extension') + +import xarray as xr # 
noqa: E402 + + log = logging.getLogger(__name__) # Carry unit attributes automatically diff --git a/ixmp/reporting/sparsedataarray.py b/ixmp/reporting/sparsedataarray.py index ed4309c5d..e141c7816 100644 --- a/ixmp/reporting/sparsedataarray.py +++ b/ixmp/reporting/sparsedataarray.py @@ -1,26 +1,14 @@ -from warnings import filterwarnings - import numpy as np import pandas as pd +import sparse # NB warnings from sparse are filtered in computations.py import xarray as xr from xarray.core.utils import either_dict_or_kwargs -# sparse 0.9.1, numba 0.49.0 -filterwarnings( - action='ignore', - message="An import was requested from a module that has moved location.", - module='sparse._coo.numba_extension', -) - -import sparse # noqa: E402 - @xr.register_dataarray_accessor('_sda') class SparseAccessor: """:mod:`xarray` accessor to help :class:`SparseDataArray`.""" def __init__(self, obj): - if not isinstance(obj, xr.DataArray): - raise TypeError('._sda accessor only valid for xr.DataArray') self.da = obj def convert(self): @@ -58,11 +46,8 @@ def COO_data(self): @property def dense(self): """Return a copy with dense (:class:`.ndarray`) data.""" - if self.COO_data: - # Use existing method xr.Variable._to_dense() - return self.da._replace(variable=self.da.variable._to_dense()) - else: - return self.da + # Use existing method xr.Variable._to_dense() + return self.da._replace(variable=self.da.variable._to_dense()) @property def dense_super(self): diff --git a/ixmp/tests/reporting/test_quantity.py b/ixmp/tests/reporting/test_quantity.py index df155be30..5e6cab3bd 100644 --- a/ixmp/tests/reporting/test_quantity.py +++ b/ixmp/tests/reporting/test_quantity.py @@ -139,18 +139,43 @@ def foo(self): names=['a', 'b']) yield AttrSeries([0, 1, 2, 3], index=idx) - def test_sum(self, foo): + @pytest.fixture + def bar(self): + yield AttrSeries([0, 1], index=pd.Index(['a1', 'a2'], name='a')) + + def test_rename(self, foo): + assert foo.rename({'a': 'c', 'b': 'd'}).dims == ('c', 'd') + + def 
test_sel(self, bar): + # Selecting 1 element from 1-D parameter still returns AttrSeries + result = bar.sel(a='a2') + assert isinstance(result, AttrSeries) + assert result.size == 1 + assert result.dims == ('a',) + assert result.iloc[0] == 1 + + def test_sum(self, foo, bar): # AttrSeries can be summed across all dimensions result = foo.sum(dim=['a', 'b']) assert isinstance(result, AttrSeries) # returns an AttrSeries - assert len(result) == 1 # with one element - assert result[0] == 6 # that has the correct value + assert result.size == 1 # with one element + assert result.item() == 6 # that has the correct value + + # Sum with wrong dim raises ValueError + with pytest.raises(ValueError): + bar.sum('b') - def test_others(self, foo): + def test_others(self, foo, bar): # Exercise other compatibility functions assert isinstance(foo.as_xarray(), xr.DataArray) assert type(foo.to_frame()) is pd.DataFrame assert foo.drop('a').dims == ('b',) + assert bar.dims == ('a',) + + with pytest.raises(NotImplementedError): + bar.item('a2') + with pytest.raises(ValueError): + bar.item() def test_sda_accessor(): @@ -189,5 +214,5 @@ def test_sda_accessor(): with pytest.raises(ValueError, match='Please make sure that the broadcast ' 'shape of just the sparse arrays is the same as the ' 'broadcast shape of all the operands.'): - z5 = SparseDataArray(x_series) * y - assert_xr_equal(z1, z5) + SparseDataArray(x_series) * y # = z5 + # assert_xr_equal(z1, z5) diff --git a/ixmp/tests/reporting/test_reporting.py b/ixmp/tests/reporting/test_reporting.py index d375d08d8..4a8af78e2 100644 --- a/ixmp/tests/reporting/test_reporting.py +++ b/ixmp/tests/reporting/test_reporting.py @@ -86,7 +86,7 @@ def test_reporter_add(): with pytest.raises(KeyExistsError, match=r"key 'a' already exists"): r.add('a', 5, strict=True) - def gen(other): + def gen(other): # pragma: no cover """A generator for apply().""" return (lambda a, b: a * b, 'a', other) From 21e872f8948feea6dfebc7db0cf2ff5f28f7bfe3 Mon Sep 17 
00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sun, 10 May 2020 14:09:23 +0200 Subject: [PATCH 09/22] Catch more exceptions in reporting.utils.parse_units --- ixmp/reporting/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ixmp/reporting/utils.py b/ixmp/reporting/utils.py index 295db9c19..f443dcd5f 100644 --- a/ixmp/reporting/utils.py +++ b/ixmp/reporting/utils.py @@ -134,15 +134,16 @@ def define_unit_parts(expr): # Quantity has no unit unit = registry.parse_units('') except pint.UndefinedUnitError: - # Unit(s) do not exist; define them in the UnitRegistry - define_unit_parts(unit) - - # Try to parse again try: + # Unit(s) do not exist; define them in the UnitRegistry + define_unit_parts(unit) + + # Try to parse again unit = registry.parse_units(unit) - except pint.UndefinedUnitError: - # Handle the silent failure of define(), above - raise invalid(unit) # from None + except (pint.UndefinedUnitError, pint.RedefinitionError): + # Handle the silent failure of define(), above; or + # define_unit_parts didn't work + raise invalid(unit) except AttributeError: # Unit contains a character like '-' that throws off pint # NB this 'except' clause must be *after* UndefinedUnitError, since From 38a03e79acda76ef50e517a3f2fdcbc6bb3f7c5a Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 2 Jun 2020 23:19:29 +0200 Subject: [PATCH 10/22] Bump sparse requirement, adjust test_sda_accessor() --- ixmp/tests/reporting/test_quantity.py | 13 +++++-------- setup.cfg | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ixmp/tests/reporting/test_quantity.py b/ixmp/tests/reporting/test_quantity.py index 5e6cab3bd..cecf9e7fa 100644 --- a/ixmp/tests/reporting/test_quantity.py +++ b/ixmp/tests/reporting/test_quantity.py @@ -195,8 +195,9 @@ def test_sda_accessor(): assert not x_dense._sda.COO_data or x_dense._sda.nan_fill assert not y_dense._sda.COO_data or y_dense._sda.nan_fill - with pytest.raises(ValueError, match='make 
sure that the broadcast shape'): - x_dense * y + # As of sparse 0.10, sparse `y` is automatically broadcast to `x_dense` + # Previously, this raised ValueError. + x_dense * y z1 = x_dense._sda.convert() * y @@ -210,9 +211,5 @@ def test_sda_accessor(): z4 = x._sda.convert() * y assert_xr_equal(z1, z4) - # Doesn't work: can't align automatically - with pytest.raises(ValueError, match='Please make sure that the broadcast ' - 'shape of just the sparse arrays is the same as the ' - 'broadcast shape of all the operands.'): - SparseDataArray(x_series) * y # = z5 - # assert_xr_equal(z1, z5) + z5 = SparseDataArray.from_series(x_series) * y + assert_xr_equal(z1, z5) diff --git a/setup.cfg b/setup.cfg index b6c96343a..ac7c19953 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,6 +20,7 @@ install_requires = pandas >= 1.0 pint PyYAML + sparse >= 0.10 xarray xlrd xlsxwriter @@ -36,7 +37,6 @@ tests = pretenders >= 1.4.4 pytest >= 5 pytest-cov - sparse docs = numpydoc sphinx >= 3.0 From 38c4271fe2cc7f3567eba5aedd2aa8e7d2d830b7 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Thu, 11 Jun 2020 13:38:47 +0200 Subject: [PATCH 11/22] Lint with flake8 --- ixmp/reporting/attrseries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ixmp/reporting/attrseries.py b/ixmp/reporting/attrseries.py index 3540ea261..a6bdfd178 100644 --- a/ixmp/reporting/attrseries.py +++ b/ixmp/reporting/attrseries.py @@ -109,7 +109,7 @@ def sel(self, indexers=None, drop=False, **indexers_kwargs): # scalar return self.loc[slice(key, key)] - idx = tuple(indexers.get(l, slice(None)) for l in self.index.names) + idx = tuple(indexers.get(n, slice(None)) for n in self.index.names) return AttrSeries(self.loc[idx]) def sum(self, *args, **kwargs): From a1b8156d7b2e180b91fc2a7bd20ebf5c77f78089 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 16:52:25 +0200 Subject: [PATCH 12/22] Add reporting.testing.random_qty --- ixmp/reporting/testing.py | 36 
++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 ixmp/reporting/testing.py diff --git a/ixmp/reporting/testing.py b/ixmp/reporting/testing.py new file mode 100644 index 000000000..3b02db437 --- /dev/null +++ b/ixmp/reporting/testing.py @@ -0,0 +1,36 @@ +from typing import Dict + +import numpy as np +import xarray as xr + +from .quantity import Quantity + + +def random_qty(shape: Dict[str, int], **kwargs): + """Return a Quantity with *shape* and random contents. + + Parameters + ---------- + shape : dict + Mapping from dimension names to + kwargs + Other keyword arguments to :class:`Quantity`. + + Returns + ------- + Quantity + Keys in `shape`—e.g. "foo"—result in a dimension named "foo" with + coords "foo0", "foo1", etc., with total length matching the value. + Data is random. + """ + return Quantity( + xr.DataArray( + np.random.rand(*shape.values()), + coords={ + dim: [f"{dim}{i}" for i in range(length)] + for dim, length in shape.items() + }, + dims=shape.keys(), + ), + **kwargs, + ) From f25835bd3d426f1334d97962688df371d0152b49 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 16:53:30 +0200 Subject: [PATCH 13/22] Add reporting.quantity.assert_quantity() --- ixmp/reporting/quantity.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ixmp/reporting/quantity.py b/ixmp/reporting/quantity.py index b18085fac..77f3ba6ea 100644 --- a/ixmp/reporting/quantity.py +++ b/ixmp/reporting/quantity.py @@ -38,3 +38,19 @@ def __call__(self, data, *args, **kwargs): Quantity = _QuantityFactory() + + +def assert_quantity(*args): + """Assert that each of `args` is a Quantity object. + + Raises + ------ + TypeError + with a indicative message. 
+ """ + for i, arg in enumerate(args): + if arg.__class__.__name__ != Quantity.CLASS: + raise TypeError( + f"arg #{i} ({repr(arg)}) is not Quantity; likely an incorrect " + "key" + ) From 269e45052c9cb7cb6d12d45f92645c5eaeb0c9ce Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 16:53:54 +0200 Subject: [PATCH 14/22] Add reporting.computations.add (from message_ix) --- ixmp/reporting/computations.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/ixmp/reporting/computations.py b/ixmp/reporting/computations.py index 77684c591..ba5ac64f0 100644 --- a/ixmp/reporting/computations.py +++ b/ixmp/reporting/computations.py @@ -10,7 +10,7 @@ import pandas as pd import pint -from .quantity import Quantity +from .quantity import Quantity, assert_quantity from .utils import ( RENAME_DIMS, dims_for_qty, @@ -50,6 +50,28 @@ xr.set_options(keep_attrs=True) +def add(*quantities, fill_value=0.0): + """Sum across multiple *quantities*.""" + # TODO check units + assert_quantity(*quantities) + + if Quantity.CLASS == "SparseDataArray": + quantities = map(Quantity, xr.broadcast(*quantities)) + + # Initialize result values with first entry + items = iter(quantities) + result = next(items) + + # Iterate over remaining entries + for q in items: + if Quantity.CLASS == 'AttrSeries': + result = result.add(q, fill_value=fill_value).dropna() + else: + result = result + q + + return result + + def apply_units(qty, units, quiet=False): """Simply apply *units* to *qty*. 
From da0b78b426f04143383609e61913bf580e8fea7e Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 16:57:15 +0200 Subject: [PATCH 15/22] Expand tests of computations.product --- ixmp/tests/reporting/test_computations.py | 36 ++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/ixmp/tests/reporting/test_computations.py b/ixmp/tests/reporting/test_computations.py index 8042272f3..3af40dfdb 100644 --- a/ixmp/tests/reporting/test_computations.py +++ b/ixmp/tests/reporting/test_computations.py @@ -1,12 +1,15 @@ import logging +import numpy as np from pandas.testing import assert_series_equal import pint import pytest +import xarray as xr import ixmp from ixmp.reporting import Reporter, Quantity, computations -from ixmp.testing import assert_logs +from ixmp.reporting.testing import random_qty +from ixmp.testing import assert_logs, assert_qty_equal from . import add_test_data @@ -52,6 +55,37 @@ def test_apply_units(data, caplog): assert_series_equal(result.to_series(), x.to_series()) +@pytest.mark.xfail( + reason="Outer join of non-intersecting dimensions (AttrSeries only)" +) +def test_product0(): + A = Quantity( + xr.DataArray([1, 2], coords=[["a0", "a1"]], dims=["a"]) + ) + B = Quantity( + xr.DataArray([3, 4], coords=[["b0", "b1"]], dims=["b"]) + ) + exp = Quantity( + xr.DataArray( + [[3, 4], [6, 8]], + coords=[["a0", "a1"], ["b0", "b1"]], + dims=["a", "b"], + ), + units="1", + ) + + assert_qty_equal(exp, computations.product(A, B)) + computations.product(exp, B) + + +def test_product1(): + """Product of quantities with overlapping dimensions.""" + A = random_qty(dict(a=2, b=2, c=2, d=2)) + B = random_qty(dict(b=2, c=2, d=2, e=2, f=2)) + + assert computations.product(A, B).size == 2 ** 6 + + def test_select(data): # Unpack *_, t_foo, t_bar, x = data From 8f7105ae563e2ed20c81aae155b18295df0f309b Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 17:00:09 +0200 Subject: [PATCH 16/22] Expand 
arguments accepted by Quantity() constructor --- ixmp/reporting/quantity.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ixmp/reporting/quantity.py b/ixmp/reporting/quantity.py index 77f3ba6ea..1ee9794ca 100644 --- a/ixmp/reporting/quantity.py +++ b/ixmp/reporting/quantity.py @@ -22,9 +22,12 @@ def __call__(self, data, *args, **kwargs): result = cls.from_series(data) elif self.CLASS == 'AttrSeries': result = cls(data, *args, **kwargs) - else: - assert len(args) == len(kwargs) == 0, (args, kwargs) + elif len(args) == len(kwargs) == 0: + # Single argument, possibly an xr.DataArray; convert to + # SparseDataArray result = data._sda.convert() + else: + result = cls(data, *args, **kwargs) if name: result.name = name From 3e21ac1d5d6ba5b7074a023060a6302be22b491b Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 17:02:46 +0200 Subject: [PATCH 17/22] Add tests of reporting.computations.add --- ixmp/tests/reporting/test_computations.py | 44 ++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/ixmp/tests/reporting/test_computations.py b/ixmp/tests/reporting/test_computations.py index 3af40dfdb..02bdfb652 100644 --- a/ixmp/tests/reporting/test_computations.py +++ b/ixmp/tests/reporting/test_computations.py @@ -20,8 +20,50 @@ @pytest.fixture(scope='function') def data(test_mp, request): scen = ixmp.Scenario(test_mp, request.node.name, request.node.name, 'new') + data_objs = list(add_test_data(scen)) rep = Reporter.from_scenario(scen) - yield [scen, rep] + list(add_test_data(scen)) + yield [scen, rep] + data_objs + + +@pytest.mark.parametrize("operands, size", [ + (("a", "a"), 18), + (("a", "x"), 36), + (("x", "b"), 36), + (("a", "b"), 36), + (("a", "x", "b"), 36), +]) +def test_add(data, operands, size): + scen, rep, t, t_foo, t_bar, x = data + + y = scen.set("y").tolist() + x = rep.get("x:t-y") + a = Quantity( + xr.DataArray( + np.random.rand(len(t_foo), len(y)), + coords=[t_foo, y], + 
dims=['t', 'y'] + ), + units=x.attrs['_unit'], + ) + b = Quantity( + xr.DataArray( + np.random.rand(len(t_bar), len(y)), + coords=[t_bar, y], + dims=['t', 'y'] + ), + units=x.attrs['_unit'], + ) + + rep.add("a:t-y", a) + rep.add("b:t-y", b) + + key = rep.add( + "result", + tuple([computations.add] + [f"{name}:t-y" for name in operands]) + ) + + result = rep.get(key) + assert size == result.size, result.to_series() def test_apply_units(data, caplog): From 176c36b3b7a22ad5f94b735daf91999e78a28a7a Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 19:10:23 +0200 Subject: [PATCH 18/22] Remove commented diagnostic code in computations.data_for_quantity --- ixmp/reporting/computations.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ixmp/reporting/computations.py b/ixmp/reporting/computations.py index ba5ac64f0..a129f6c09 100644 --- a/ixmp/reporting/computations.py +++ b/ixmp/reporting/computations.py @@ -211,17 +211,6 @@ def data_for_quantity(ix_type, name, column, scenario, config): data = data.rename(columns=RENAME_DIMS) \ .set_index(dims) - # Check sparseness - # try: - # shape = list(map(len, data.index.levels)) - # except AttributeError: - # shape = [data.index.size] - # size = reduce(mul, shape) - # filled = 100 * len(data) / size if size else 'NA' - # need_to_chunk = size > 1e7 and filled < 1 - # info = (name, shape, filled, size, need_to_chunk) - # log.debug(' '.join(map(str, info))) - # Convert to a Quantity, assign attrbutes and name qty = Quantity( data[column], From 6b51340c20921d2ddcc93f42e547291ef1716085 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 19:48:24 +0200 Subject: [PATCH 19/22] Improve AttrSeries.squeeze() --- ixmp/reporting/attrseries.py | 31 ++++++++++++++++++++++++--- ixmp/reporting/computations.py | 3 ++- ixmp/tests/reporting/test_quantity.py | 4 ++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/ixmp/reporting/attrseries.py b/ixmp/reporting/attrseries.py 
index a6bdfd178..498b63c47 100644 --- a/ixmp/reporting/attrseries.py +++ b/ixmp/reporting/attrseries.py @@ -139,9 +139,34 @@ def sum(self, *args, **kwargs): return AttrSeries(obj.sum(*args, **kwargs), attrs=attrs) - def squeeze(self, *args, **kwargs): - kwargs.pop('drop') - return super().squeeze(*args, **kwargs) if len(self) > 1 else self + def squeeze(self, dim=None, *args, **kwargs): + assert kwargs.pop("drop", True) + + try: + idx = self.index.remove_unused_levels() + except AttributeError: + return self + + to_drop = [] + for i, name in enumerate(idx.names): + if dim and name != dim: + continue + elif len(idx.levels[i]) > 1: + if dim is None: + continue + else: + raise ValueError( + "cannot select a dimension to squeeze out which has " + "length greater than one" + ) + + to_drop.append(name) + + if dim and not to_drop: + # Specified dimension does not exist + raise KeyError(dim) + + return self.droplevel(to_drop) def as_xarray(self): return xr.DataArray.from_series(self) diff --git a/ixmp/reporting/computations.py b/ixmp/reporting/computations.py index a129f6c09..72d432ec8 100644 --- a/ixmp/reporting/computations.py +++ b/ixmp/reporting/computations.py @@ -220,7 +220,8 @@ def data_for_quantity(ix_type, name, column, scenario, config): try: # Remove length-1 dimensions for scalars qty = qty.squeeze('index', drop=True) - except KeyError: + except (KeyError, ValueError): + # KeyError if "index" does not exist; ValueError if its length is > 1 pass return qty diff --git a/ixmp/tests/reporting/test_quantity.py b/ixmp/tests/reporting/test_quantity.py index cecf9e7fa..e8f0cb1e2 100644 --- a/ixmp/tests/reporting/test_quantity.py +++ b/ixmp/tests/reporting/test_quantity.py @@ -154,6 +154,10 @@ def test_sel(self, bar): assert result.dims == ('a',) assert result.iloc[0] == 1 + def test_squeeze(self, foo): + assert foo.sel(a="a1").squeeze().dims == ("b",) + assert foo.sel(a="a2", b="b1").squeeze().values == 2 + def test_sum(self, foo, bar): # AttrSeries can be summed 
across all dimensions result = foo.sum(dim=['a', 'b']) From a317fe43dc7171e56aff3e18b37795ecee0e0717 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 19:48:46 +0200 Subject: [PATCH 20/22] Move reporting assertions to reporting.testing --- ixmp/reporting/testing.py | 49 ++++++++++++++++++++++++++ ixmp/testing.py | 48 ++----------------------- ixmp/tests/reporting/test_reporting.py | 8 ++--- 3 files changed, 56 insertions(+), 49 deletions(-) diff --git a/ixmp/reporting/testing.py b/ixmp/reporting/testing.py index 3b02db437..ea32e896d 100644 --- a/ixmp/reporting/testing.py +++ b/ixmp/reporting/testing.py @@ -1,11 +1,60 @@ from typing import Dict import numpy as np +from pandas.testing import assert_series_equal import xarray as xr from .quantity import Quantity +def assert_qty_equal(a, b, check_type=True, check_attrs=True, **kwargs): + """Assert that Quantity objects *a* and *b* are equal. + + If *check_type* is False, *a* and *b* are first converted using + ``Quantity(...)``. + """ + if not check_type: + a = Quantity(a) + b = Quantity(b) + + if Quantity.CLASS == 'AttrSeries': + try: + a = a.sort_index() + b = b.sort_index() + except TypeError: + pass + assert_series_equal(a, b, check_dtype=False, **kwargs) + else: + import xarray.testing + xarray.testing.assert_equal(a, b, **kwargs) + + # check attributes are equal + if check_attrs: + assert a.attrs == b.attrs + + +def assert_qty_allclose(a, b, check_type=True, check_attrs=True, **kwargs): + """Assert that Quantity objects *a* and *b* have numerically close values. + + If *check_type* is False, *a* and *b* are first converted using + ``Quantity(...)``. 
+ """ + if not check_type: + a = Quantity(a) + b = Quantity(b) + + if Quantity.CLASS == 'AttrSeries': + assert_series_equal(a.sort_index(), b.sort_index(), **kwargs) + else: + import xarray.testing + kwargs.pop('check_dtype', None) + xarray.testing.assert_allclose(a._sda.dense, b._sda.dense, **kwargs) + + # check attributes are equal + if check_attrs: + assert a.attrs == b.attrs + + def random_qty(shape: Dict[str, int], **kwargs): """Return a Quantity with *shape* and random contents. diff --git a/ixmp/testing.py b/ixmp/testing.py index f15b04f27..509581b83 100644 --- a/ixmp/testing.py +++ b/ixmp/testing.py @@ -54,13 +54,14 @@ from click.testing import CliRunner import numpy as np import pandas as pd -from pandas.testing import assert_series_equal import pytest from . import cli, config as ixmp_config from .core import Platform, TimeSeries, Scenario, IAMC_IDX from .reporting import Quantity - +from .reporting.testing import ( # noqa: F401 + assert_qty_equal, assert_qty_allclose +) log = logging.getLogger(__name__) @@ -492,49 +493,6 @@ def test_foo(caplog): pytest.fail('\n'.join(lines)) -def assert_qty_equal(a, b, check_type=True, check_attrs=True, **kwargs): - """Assert that Quantity objects *a* and *b* are equal. - - When Quantity is AttrSeries, *a* and *b* are first passed through - :meth:`as_quantity`. - """ - if not check_type: - a = Quantity(a) - b = Quantity(b) - - if Quantity.CLASS == 'AttrSeries': - assert_series_equal(a, b, check_dtype=False, **kwargs) - else: - import xarray.testing - xarray.testing.assert_equal(a, b, **kwargs) - - # check attributes are equal - if check_attrs: - assert a.attrs == b.attrs - - -def assert_qty_allclose(a, b, check_type=True, check_attrs=True, **kwargs): - """Assert that Quantity objects *a* and *b* have numerically close values. - - When Quantity is AttrSeries, *a* and *b* are first passed through - :meth:`as_quantity`. 
- """ - if not check_type: - a = Quantity(a) - b = Quantity(b) - - if Quantity.CLASS == 'AttrSeries': - assert_series_equal(a, b, **kwargs) - else: - import xarray.testing - kwargs.pop('check_dtype', None) - xarray.testing.assert_allclose(a._sda.dense, b._sda.dense, **kwargs) - - # check attributes are equal - if check_attrs: - assert a.attrs == b.attrs - - # Data structure for memory information used by :meth:`memory_usage`. _MemInfo = namedtuple('MemInfo', [ 'profiled', diff --git a/ixmp/tests/reporting/test_reporting.py b/ixmp/tests/reporting/test_reporting.py index 4a8af78e2..58f2c0944 100644 --- a/ixmp/tests/reporting/test_reporting.py +++ b/ixmp/tests/reporting/test_reporting.py @@ -223,10 +223,10 @@ def test_reporter_from_dantzig(test_mp, ureg): # ...produces the expected new value obs = rep.get(new_key) d_ij = rep.get('d:i-j') - exp = (d_ij * weights).sum(dim=['j']) / weights.sum(dim=['j']) - # FIXME attrs has to be explicitly copied here because math is done which - # returns a pd.Series - exp.attrs = d_ij.attrs + exp = Quantity( + (d_ij * weights).sum(dim=['j']) / weights.sum(dim=['j']), + attrs=d_ij.attrs, + ) assert_qty_equal(exp, obs) From d33e2e98c90ceecc29ed9b729b5e55dcf684f05a Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 22:16:39 +0200 Subject: [PATCH 21/22] Update reporting.quantity documentation --- doc/source/conf.py | 1 + doc/source/reporting.rst | 51 +++++++++++++++++---- ixmp/reporting/attrseries.py | 24 +++++----- ixmp/reporting/quantity.py | 26 ++++++++++- ixmp/reporting/sparsedataarray.py | 64 ++++++++++++++++++--------- ixmp/tests/reporting/test_quantity.py | 1 - 6 files changed, 124 insertions(+), 43 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 7cc6ec267..9b501bff9 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -102,6 +102,7 @@ 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 'pint': ('https://pint.readthedocs.io/en/stable/', None), 'python': 
('https://docs.python.org/3/', None), + 'sparse': ('https://sparse.pydata.org/en/stable/', None), 'xarray': ('https://xarray.pydata.org/en/stable/', None), } diff --git a/doc/source/reporting.rst b/doc/source/reporting.rst index aadc92247..e83848fdc 100644 --- a/doc/source/reporting.rst +++ b/doc/source/reporting.rst @@ -1,7 +1,7 @@ .. currentmodule:: ixmp.reporting Reporting -========= +********* Top-level methods and classes: @@ -11,7 +11,6 @@ Top-level methods and classes: Reporter Key Quantity - as_quantity Others: @@ -184,12 +183,30 @@ Others: >>> foo('a b c') foo:a-b-c -.. automodule:: ixmp.reporting - :members: Quantity, as_quantity +.. autodata:: ixmp.reporting.Quantity(data, *args, **kwargs) + :annotation: + +The :data:`.Quantity` constructor converts its arguments to an internal, :class:`xarray.DataArray`-like data format: + +.. code-block:: python + + # Existing data + data = pd.Series(...) + + # Convert to a Quantity for use in reporting calculations + qty = Quantity(data, name="Quantity name", units="kg") + rep.add("new_qty", qty) + +Common :mod:`ixmp.reporting` usage, e.g. in :mod:`message_ix`, creates large, sparse data frames (billions of possible elements, but <1% populated); :class:`~xarray.DataArray`'s default, 'dense' storage format would be too large for available memory. + +- Currently, Quantity is :class:`.AttrSeries`, a wrapped :class:`pandas.Series` that behaves like a :class:`~xarray.DataArray`. +- In the future, :mod:`ixmp.reporting` will use :class:`.SparseDataArray`, and eventually :class:`~xarray.DataArray` backed by sparse data, directly. + +The goal is that reporting code, including built-in and user computations, can treat quantity arguments as if they were :class:`~xarray.DataArray`. Computations ------------- +============ .. automodule:: ixmp.reporting.computations :members: @@ -201,6 +218,7 @@ Computations Calculations: .. 
autosummary:: + add aggregate apply_units disaggregate_shares @@ -221,10 +239,27 @@ Computations concat -Utilities ---------- +Internal format for reporting quantities +======================================== -.. autoclass:: ixmp.reporting.quantity.AttrSeries +.. currentmodule:: ixmp.reporting.quantity + +.. automodule:: ixmp.reporting.quantity + :members: assert_quantity + +.. currentmodule:: ixmp.reporting.attrseries + +.. automodule:: ixmp.reporting.attrseries + :members: + +.. currentmodule:: ixmp.reporting.sparsedataarray + +.. automodule:: ixmp.reporting.sparsedataarray + :members: SparseDataArray, SparseAccessor + + +Utilities +========= .. automodule:: ixmp.reporting.utils :members: diff --git a/ixmp/reporting/attrseries.py b/ixmp/reporting/attrseries.py index 498b63c47..9dd18cb46 100644 --- a/ixmp/reporting/attrseries.py +++ b/ixmp/reporting/attrseries.py @@ -6,10 +6,6 @@ class AttrSeries(pd.Series): """:class:`pandas.Series` subclass imitating :class:`xarray.DataArray`. - Future versions of :mod:`ixmp.reporting` will use :class:`xarray.DataArray` - as :class:`Quantity`; however, because :mod:`xarray` currently lacks sparse - matrix support, ixmp quantities may be too large for available memory. - The AttrSeries class provides similar methods and behaviour to :class:`xarray.DataArray`, so that :mod:`ixmp.reporting.computations` methods can use xarray-like syntax. @@ -59,14 +55,16 @@ def __init__(self, data=None, *args, name=None, attrs=None, **kwargs): @classmethod def from_series(cls, series, sparse=None): + """Like :meth:`xarray.DataArray.from_series`.""" return cls(series) def assign_coords(self, **kwargs): + """Like :meth:`xarray.DataArray.assign_coords`.""" return pd.concat([self], keys=kwargs.values(), names=kwargs.keys()) @property def coords(self): - """Read-only.""" + """Like :attr:`xarray.DataArray.coords`. 
Read-only.""" result = dict() for name, levels in zip(self.index.names, self.index.levels): result[name] = xr.Dataset(None, coords={name: levels})[name] @@ -74,12 +72,15 @@ def coords(self): @property def dims(self): + """Like :attr:`xarray.DataArray.dims`.""" return tuple(self.index.names) def drop(self, label): + """Like :meth:`xarray.DataArray.drop`.""" return self.droplevel(label) def item(self, *args): + """Like :meth:`xarray.DataArray.item`.""" if len(args) and args != (None,): raise NotImplementedError elif self.size != 1: @@ -87,12 +88,14 @@ def item(self, *args): return self.iloc[0] def rename(self, new_name_or_name_dict): + """Like :meth:`xarray.DataArray.rename`.""" if isinstance(new_name_or_name_dict, dict): return self.rename_axis(index=new_name_or_name_dict) else: return super().rename(new_name_or_name_dict) def sel(self, indexers=None, drop=False, **indexers_kwargs): + """Like :meth:`xarray.DataArray.sel`.""" indexers = indexers or {} indexers.update(indexers_kwargs) if len(indexers) == 1: @@ -113,6 +116,7 @@ def sel(self, indexers=None, drop=False, **indexers_kwargs): return AttrSeries(self.loc[idx]) def sum(self, *args, **kwargs): + """Like :meth:`xarray.DataArray.sum`.""" obj = super(AttrSeries, self) attrs = None @@ -140,6 +144,7 @@ def sum(self, *args, **kwargs): return AttrSeries(obj.sum(*args, **kwargs), attrs=attrs) def squeeze(self, dim=None, *args, **kwargs): + """Like :meth:`xarray.DataArray.squeeze`.""" assert kwargs.pop("drop", True) try: @@ -168,25 +173,22 @@ def squeeze(self, dim=None, *args, **kwargs): return self.droplevel(to_drop) - def as_xarray(self): - return xr.DataArray.from_series(self) - def transpose(self, *dims): + """Like :meth:`xarray.DataArray.transpose`.""" return self.reorder_levels(dims) def to_dataframe(self): + """Like :meth:`xarray.DataArray.to_dataframe`.""" return self.to_frame() def to_series(self): + """Like :meth:`xarray.DataArray.to_series`.""" return self def align_levels(self, other): """Work around 
https://github.com/pandas-dev/pandas/issues/25760. Return a copy of *obj* with common levels in the same order as *ref*. - - .. todo:: remove when Quantity is xr.DataArray, or above issues is - closed. """ if not isinstance(self.index, pd.MultiIndex): return self diff --git a/ixmp/reporting/quantity.py b/ixmp/reporting/quantity.py index 1ee9794ca..2ded612fa 100644 --- a/ixmp/reporting/quantity.py +++ b/ixmp/reporting/quantity.py @@ -3,8 +3,30 @@ class _QuantityFactory: - #: The current internal class used to represent reporting quantities. - #: :meth:`as_quantity` always converts to this type. + """Convert arguments to the internal Quantity data format. + + Parameters + ---------- + data + Quantity data. + args + Positional arguments, passed to :class:`.AttrSeries` or + :class:`.SparseDataArray`. + kwargs + Keyword arguments, passed to :class:`.AttrSeries` or + :class:`.SparseDataArray`. + + Other parameters + ---------------- + name : str, optional + Quantity name. + units : str, optional + Quantity units. + attrs : dict, optional + Dictionary of attributes; similar to :attr:`~xarray.DataArray.attrs`. + """ + # The current internal class used to represent reporting quantities. + # Calling ``Quantity(...)`` always converts to this type. CLASS = 'AttrSeries' # CLASS = 'SparseDataArray' diff --git a/ixmp/reporting/sparsedataarray.py b/ixmp/reporting/sparsedataarray.py index e141c7816..b65405fe0 100644 --- a/ixmp/reporting/sparsedataarray.py +++ b/ixmp/reporting/sparsedataarray.py @@ -7,7 +7,11 @@ @xr.register_dataarray_accessor('_sda') class SparseAccessor: - """:mod:`xarray` accessor to help :class:`SparseDataArray`.""" + """:mod:`xarray` accessor to help :class:`SparseDataArray`. + + See the xarray accessor documentation, e.g. + :func:`~xarray.register_dataarray_accessor`. + """ def __init__(self, obj): self.da = obj @@ -56,37 +60,48 @@ def dense_super(self): class SparseDataArray(xr.DataArray): - """:class:`xr.DataArray` with sparse data. 
+ """:class:`~xarray.DataArray` with sparse data. SparseDataArray uses :class:`sparse.COO` for storage with :data:`numpy.nan` - as its :attr:`sparse.COO.fill_value`. Some methods of :class:`.DataArray` - are overridden to ensure data is in sparse, or dense, format as necessary, - to provide expected functionality not currently supported by :mod:`sparse`, - and to avoid exhausting memory for some operations that require dense data. - - See Also - -------- - SparseAccessor + as its :attr:`sparse.COO.fill_value`. Some methods of + :class:`~xarray.DataArray` are overridden to ensure data is in sparse, or + dense, format as necessary, to provide expected functionality not currently + supported by :mod:`sparse`, and to avoid exhausting memory for some + operations that require dense data. """ __slots__ = tuple() @classmethod def from_series(cls, obj, sparse=True): + """Convert a pandas.Series into a SparseDataArray.""" # Call the parent method always with sparse=True, then re-wrap return xr.DataArray.from_series(obj, sparse=True)._sda.convert() - def equals(self, other): - """Necessary for :meth:`xarray.testing.assert_equal` to work.""" + def equals(self, other) -> bool: + """True if two SparseDataArrays have the same dims, coords, and values. + + Overrides :meth:`~xarray.DataArray.equals` for sparse data. + """ + # Necessary for :meth:`xarray.testing.assert_equal` to work. return self.variable.equals(other.variable, equiv=np.equal) @property def loc(self): - # FIXME doesn't allow assignment + """Attribute for location based indexing like pandas. + + .. note:: This version does not allow assignment, since the underlying + sparse array is read-only. To modify the contents, create a copy or + perform an operation that returns a new array. 
+ """ return self._sda.dense_super.loc def sel(self, indexers=None, method=None, tolerance=None, drop=False, **indexers_kwargs) -> 'SparseDataArray': - """Handle >1-D indexers with sparse data.""" + """Return a new array by selecting labels along the specified dim(s). + + Overrides :meth:`~xarray.DataArray.sel` to handle >1-D indexers with + sparse data. + """ indexers = either_dict_or_kwargs(indexers, indexers_kwargs, 'sel') if isinstance(indexers, dict) and len(indexers) > 1: result = self @@ -98,15 +113,22 @@ def sel(self, indexers=None, method=None, tolerance=None, drop=False, return super().sel(indexers=indexers, method=method, tolerance=tolerance, drop=drop) - def to_dataframe(self): - # FIXME this does exactly match the behaviour of xr.DataArray; it omits - # coordinate variable - return self.to_series().to_frame() + def to_dataframe(self, name=None): + """Convert this array and its coords into a :class:`~pandas.DataFrame`. + + Overrides :meth:`~xarray.DataArray.to_dataframe`. + """ + return self.to_series().to_frame(name) def to_series(self) -> pd.Series: - # Use SparseArray.coords and .data (each already 1-D) to construct a - # pd.Series without first converting to a potentially very large - # ndarray + """Convert this array into a :class:`~pandas.Series`. + + Overrides :meth:`~xarray.DataArray.to_series` to create the series + without first converting to a potentially very large + :class:`numpy.ndarray`. 
+ """ + # Use SparseArray.coords and .data (each already 1-D) to construct the + # pd.Series # Construct a pd.MultiIndex without using .from_product index = pd.MultiIndex.from_arrays(self.data.coords, names=self.dims) \ diff --git a/ixmp/tests/reporting/test_quantity.py b/ixmp/tests/reporting/test_quantity.py index e8f0cb1e2..935c35eae 100644 --- a/ixmp/tests/reporting/test_quantity.py +++ b/ixmp/tests/reporting/test_quantity.py @@ -171,7 +171,6 @@ def test_sum(self, foo, bar): def test_others(self, foo, bar): # Exercise other compatibility functions - assert isinstance(foo.as_xarray(), xr.DataArray) assert type(foo.to_frame()) is pd.DataFrame assert foo.drop('a').dims == ('b',) assert bar.dims == ('a',) From 28d59ae3bc188e996ddeb051ac03fb8fec364783 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 19 Jun 2020 22:16:50 +0200 Subject: [PATCH 22/22] Add #317 to release notes --- RELEASE_NOTES.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index c55deac18..ed7c75e4e 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -4,6 +4,7 @@ Next release All changes ----------- +- :pull:`317`: Make :class:`reporting.Quantity` classes interchangeable. - :pull:`330`: Use GitHub Actions for continuous testing and integration.