Merge pull request #317 from khaeru/issue/191

Generalize reporting.Quantity in preparation for sparse xarray
iiasa · Jun 19, 2020 · 9785f37 · 9785f37
2 parents 7d7a6cb + 28d59ae
commit 9785f37
Show file tree

Hide file tree

Showing 17 changed files with 856 additions and 410 deletions.
diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst
@@ -4,6 +4,7 @@ Next release
 All changes
 -----------
 
+- :pull:`317`: Make :class:`reporting.Quantity` classes interchangeable.
 - :pull:`330`: Use GitHub Actions for continuous testing and integration.
 
 

diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -102,6 +102,7 @@
     'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
     'pint': ('https://pint.readthedocs.io/en/stable/', None),
     'python': ('https://docs.python.org/3/', None),
+    'sparse': ('https://sparse.pydata.org/en/stable/', None),
     'xarray': ('https://xarray.pydata.org/en/stable/', None),
 }
 

diff --git a/doc/source/reporting.rst b/doc/source/reporting.rst
@@ -1,7 +1,7 @@
 .. currentmodule:: ixmp.reporting
 
 Reporting
-=========
+*********
 
 Top-level methods and classes:
 
@@ -11,7 +11,6 @@ Top-level methods and classes:
    Reporter
    Key
    Quantity
-   as_quantity
 
 Others:
 
@@ -184,12 +183,30 @@ Others:
           >>> foo('a b c')
           foo:a-b-c
 
-.. automodule:: ixmp.reporting
-   :members: Quantity, as_quantity
+.. autodata:: ixmp.reporting.Quantity(data, *args, **kwargs)
+   :annotation:
+
+The :data:`.Quantity` constructor converts its arguments to an internal, :class:`xarray.DataArray`-like data format:
+
+.. code-block:: python
+
+   # Existing data
+   data = pd.Series(...)
+
+   # Convert to a Quantity for use in reporting calculations
+   qty = Quantity(data, name="Quantity name", units="kg")
+   rep.add("new_qty", qty)
+
+Common :mod:`ixmp.reporting` usage, e.g. in :mod:`message_ix`, creates large, sparse data frames (billions of possible elements, but <1% populated); :class:`~xarray.DataArray`'s default, 'dense' storage format would be too large for available memory.
+
+- Currently, Quantity is :class:`.AttrSeries`, a wrapped :class:`pandas.Series` that behaves like a :class:`~xarray.DataArray`.
+- In the future, :mod:`ixmp.reporting` will use :class:`.SparseDataArray`, and eventually :class:`~xarray.DataArray` backed by sparse data, directly.
+
+The goal is that reporting code, including built-in and user computations, can treat quantity arguments as if they were :class:`~xarray.DataArray`.
 
 
 Computations
-------------
+============
 
 .. automodule:: ixmp.reporting.computations
    :members:
@@ -201,6 +218,7 @@ Computations
    Calculations:
 
    .. autosummary::
+      add
       aggregate
       apply_units
       disaggregate_shares
@@ -221,10 +239,27 @@ Computations
       concat
 
 
-Utilities
----------
+Internal format for reporting quantities
+========================================
 
-.. autoclass:: ixmp.reporting.quantity.AttrSeries
+.. currentmodule:: ixmp.reporting.quantity
+
+.. automodule:: ixmp.reporting.quantity
+   :members: assert_quantity
+
+.. currentmodule:: ixmp.reporting.attrseries
+
+.. automodule:: ixmp.reporting.attrseries
+   :members:
+
+.. currentmodule:: ixmp.reporting.sparsedataarray
+
+.. automodule:: ixmp.reporting.sparsedataarray
+   :members: SparseDataArray, SparseAccessor
+
+
+Utilities
+=========
 
 .. automodule:: ixmp.reporting.utils
    :members:
diff --git a/ixmp/cli.py b/ixmp/cli.py
@@ -92,7 +92,7 @@ def report(context, config, key):
     r.configure(config)
 
     # Print the target
-    print(r.get(key))
+    print(r.get(key).to_series().sort_index())
 
 
 @main.command('show-versions')

diff --git a/ixmp/reporting/__init__.py b/ixmp/reporting/__init__.py
@@ -44,7 +44,7 @@
 from .describe import describe_recursive
 from .exceptions import ComputationError
 from .key import Key
-from .quantity import Quantity, as_quantity
+from .quantity import Quantity
 from .utils import (
     REPLACE_UNITS,
     RENAME_DIMS,
@@ -56,7 +56,6 @@
     'Key',
     'Quantity',
     'Reporter',
-    'as_quantity',
     'configure',
 ]
 

diff --git a/ixmp/reporting/attrseries.py b/ixmp/reporting/attrseries.py
@@ -0,0 +1,197 @@
+import pandas as pd
+import pandas.core.indexes.base as ibase
+import xarray as xr
+
+
+class AttrSeries(pd.Series):
+    """:class:`pandas.Series` subclass imitating :class:`xarray.DataArray`.
+
+    The AttrSeries class provides similar methods and behaviour to
+    :class:`xarray.DataArray`, so that :mod:`ixmp.reporting.computations`
+    methods can use xarray-like syntax.
+
+    Parameters
+    ----------
+    units : str or pint.Unit, optional
+        Set the units attribute. The value is converted to :class:`pint.Unit`
+        and added to `attrs`.
+    attrs : :class:`~collections.abc.Mapping`, optional
+        Set the :attr:`~pandas.Series.attrs` of the AttrSeries. This attribute
+        was added in `pandas 1.0
+        <https://pandas.pydata.org/docs/whatsnew/v1.0.0.html>`_, but is not
+        currently supported by the Series constructor.
+    """
+
+    # See https://pandas.pydata.org/docs/development/extending.html
+    @property
+    def _constructor(self):
+        return AttrSeries
+
+    def __init__(self, data=None, *args, name=None, attrs=None, **kwargs):
+        attrs = attrs or dict()
+
+        if hasattr(data, 'attrs'):
+            # Use attrs from an existing object
+            new_attrs = data.attrs.copy()
+
+            # Overwrite with explicit attrs argument
+            new_attrs.update(attrs)
+            attrs = new_attrs
+
+        if isinstance(data, (AttrSeries, xr.DataArray)):
+            # Extract name from existing object or use the argument
+            name = ibase.maybe_extract_name(name, data, type(self))
+
+            # Pre-convert to pd.Series from xr.DataArray to preserve names and
+            # labels. For AttrSeries, this is a no-op (see below).
+            data = data.to_series()
+
+        # Don't pass attrs to pd.Series constructor; it currently does not
+        # accept them
+        super().__init__(data, *args, name=name, **kwargs)
+
+        # Update the attrs after initialization
+        self.attrs.update(attrs)
+
+    @classmethod
+    def from_series(cls, series, sparse=None):
+        """Like :meth:`xarray.DataArray.from_series`; *sparse* is ignored."""
+        return cls(series)
+
+    def assign_coords(self, **kwargs):
+        """Like :meth:`xarray.DataArray.assign_coords`."""
+        return pd.concat([self], keys=kwargs.values(), names=kwargs.keys())
+
+    @property
+    def coords(self):
+        """Like :attr:`xarray.DataArray.coords`. Read-only."""
+        result = dict()
+        for name, levels in zip(self.index.names, self.index.levels):
+            result[name] = xr.Dataset(None, coords={name: levels})[name]
+        return result
+
+    @property
+    def dims(self):
+        """Like :attr:`xarray.DataArray.dims`."""
+        return tuple(self.index.names)
+
+    def drop(self, label):
+        """Like :meth:`xarray.DataArray.drop`."""
+        return self.droplevel(label)
+
+    def item(self, *args):
+        """Like :meth:`xarray.DataArray.item`."""
+        if len(args) and args != (None,):
+            raise NotImplementedError
+        elif self.size != 1:
+            raise ValueError
+        return self.iloc[0]
+
+    def rename(self, new_name_or_name_dict):
+        """Like :meth:`xarray.DataArray.rename`."""
+        if isinstance(new_name_or_name_dict, dict):
+            return self.rename_axis(index=new_name_or_name_dict)
+        else:
+            return super().rename(new_name_or_name_dict)
+
+    def sel(self, indexers=None, drop=False, **indexers_kwargs):
+        """Like :meth:`xarray.DataArray.sel`."""
+        indexers = indexers or {}
+        indexers.update(indexers_kwargs)
+        if len(indexers) == 1:
+            level, key = list(indexers.items())[0]
+            if isinstance(key, str) and not drop:
+                if isinstance(self.index, pd.MultiIndex):
+                    # When using .loc[] to select 1 label on 1 level, pandas
+                    # drops the level. Use .xs() to avoid this behaviour unless
+                    # drop=True
+                    return AttrSeries(self.xs(key, level=level,
+                                              drop_level=False))
+                else:
+                    # No MultiIndex; use .loc with a slice to avoid returning
+                    # a scalar
+                    return self.loc[slice(key, key)]
+
+        idx = tuple(indexers.get(n, slice(None)) for n in self.index.names)
+        return AttrSeries(self.loc[idx])
+
+    def sum(self, *args, **kwargs):
+        """Like :meth:`xarray.DataArray.sum`."""
+        obj = super(AttrSeries, self)
+        attrs = None
+
+        try:
+            dim = kwargs.pop('dim')
+        except KeyError:
+            dim = list(args)
+            args = tuple()
+
+        if len(dim) == len(self.index.names):
+            bad_dims = set(dim) - set(self.index.names)
+            if bad_dims:
+                raise ValueError(f'{bad_dims} not found in array dimensions '
+                                 f'{self.index.names}')
+            # Simple sum
+            kwargs = {}
+        else:
+            # pivot and sum across columns
+            obj = self.unstack(dim)
+            kwargs['axis'] = 1
+            # Result will be DataFrame; re-attach attrs when converted to
+            # AttrSeries
+            attrs = self.attrs
+
+        return AttrSeries(obj.sum(*args, **kwargs), attrs=attrs)
+
+    def squeeze(self, dim=None, *args, **kwargs):
+        """Like :meth:`xarray.DataArray.squeeze`."""
+        assert kwargs.pop("drop", True)
+
+        try:
+            idx = self.index.remove_unused_levels()
+        except AttributeError:
+            return self
+
+        to_drop = []
+        for i, name in enumerate(idx.names):
+            if dim and name != dim:
+                continue
+            elif len(idx.levels[i]) > 1:
+                if dim is None:
+                    continue
+                else:
+                    raise ValueError(
+                        "cannot select a dimension to squeeze out which has "
+                        "length greater than one"
+                    )
+
+            to_drop.append(name)
+
+        if dim and not to_drop:
+            # Specified dimension does not exist
+            raise KeyError(dim)
+
+        return self.droplevel(to_drop)
+
+    def transpose(self, *dims):
+        """Like :meth:`xarray.DataArray.transpose`."""
+        return self.reorder_levels(dims)
+
+    def to_dataframe(self):
+        """Like :meth:`xarray.DataArray.to_dataframe`."""
+        return self.to_frame()
+
+    def to_series(self):
+        """Like :meth:`xarray.DataArray.to_series`."""
+        return self
+
+    def align_levels(self, other):
+        """Work around https://github.com/pandas-dev/pandas/issues/25760.
+
+        Return *self* with common levels in the same order as *other*.
+        """
+        if not isinstance(self.index, pd.MultiIndex):
+            return self
+        common = [n for n in other.index.names if n in self.index.names]
+        unique = [n for n in self.index.names if n not in common]
+        return self.reorder_levels(common + unique)
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ Next release @@
     All changes
     -----------
+    - :pull:`317`: Make :class:`reporting.Quantity` classes interchangeable.
     - :pull:`330`: Use GitHub Actions for continuous testing and integration.
@@ Expand Down @@