Fix non-unique index case

lincc-frameworks · Nov 1, 2024 · a6b83d8 · a6b83d8
1 parent 232c97f
commit a6b83d8
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 60 deletions.
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -10,13 +10,12 @@
 from pandas._libs import lib
 from pandas._typing import Any, AnyAll, Axis, IndexLabel
 from pandas.api.extensions import no_default
+from pandas.api.types import is_bool_dtype
 from pandas.core.computation.expr import PARSERS, PandasExprVisitor
 
-from nested_pandas.series import packer
+from nested_pandas.nestedframe.utils import extract_nest_names
 from nested_pandas.series.dtype import NestedDtype
-
-from ..series.packer import pack_sorted_df_into_struct
-from .utils import extract_nest_names
+from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct
 
 
 class NestedPandasExprVisitor(PandasExprVisitor):
@@ -219,10 +218,8 @@ def __setitem__(self, key, value):
             "." in key and key.split(".")[0] in self.nested_columns
         ):
             nested, col = key.split(".")
-            new_flat = self[nested].nest.to_flat()
-            new_flat[col] = value
-            packed = packer.pack(new_flat)
-            return super().__setitem__(nested, packed)
+            new_nested_series = self[nested].nest.with_flat_field(col, value)
+            return super().__setitem__(nested, new_nested_series)
 
         # Adding a new nested structure from a column
         # Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
@@ -231,8 +228,9 @@ def __setitem__(self, key, value):
             if isinstance(value, pd.Series):
                 value.name = col
                 value = value.to_frame()
-            packed = packer.pack(value)
-            return super().__setitem__(new_nested, packed)
+            new_df = self.add_nested(value, name=new_nested)
+            self._update_inplace(new_df)
+            return None
 
         return super().__setitem__(key, value)
 
@@ -286,12 +284,12 @@ def add_nested(
             A new NestedFrame with the added nested column.
         """
         # Add sources to objects
-        packed = packer.pack(obj, name=name, on=on, dtype=dtype)
+        packed = pack(obj, name=name, on=on, dtype=dtype)
         new_df = self.copy()
         return new_df.join(packed, how=how)
 
     @classmethod
-    def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
+    def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
         """Creates a NestedFrame with base and nested columns from a flat
         dataframe.
 
@@ -307,7 +305,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
             in the list will attempt to be packed into a single nested column
             with the name provided in `name`. If None, is defined as all
             columns not in `base_columns`.
-        index: str, or None
+        on: str or None
             The name of a column to use as the new index. Typically, the index
             should have a unique value per row for base columns, and should
             repeat for nested columns. For example, a dataframe with two
@@ -333,11 +331,11 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
         """
 
         # Resolve new index
-        if index is not None:
+        if on is not None:
             # if a base column is chosen remove it
-            if index in base_columns:
-                base_columns = [col for col in base_columns if col != index]
-            df = df.set_index(index)
+            if on in base_columns:
+                base_columns = [col for col in base_columns if col != on]
+            df = df.set_index(on)
 
         # drop duplicates on index
         out_df = df[base_columns][~df.index.duplicated(keep="first")]
@@ -404,7 +402,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
             raise ValueError("No columns were assigned as list columns.")
 
         # Pack list columns into a nested column
-        packed_df = packer.pack_lists(df[list_columns])
+        packed_df = pack_lists(df[list_columns])
         packed_df.name = name
 
         # join the nested column to the base_column df
@@ -521,18 +519,37 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
         # to the nest and repack.  Otherwise, apply it to this instance as usual,
         # since it operated on the base attributes.
         if isinstance(result, _SeriesFromNest):
+            if not is_bool_dtype(result.dtype):
+                raise ValueError("Query condition must evaluate to a boolean Series")
+
             nest_name, flat_nest = result.nest_name, result.flat_nest
-            new_flat_nest = flat_nest.loc[result]
-            result = self.copy()
-            result[nest_name] = pack_sorted_df_into_struct(new_flat_nest)
+
+            # Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
+            flat_nest = flat_nest.set_index(self[nest_name].array.list_index)
+            query_result = result.set_axis(self[nest_name].array.list_index)
+            # Selecting flat values matching the query result
+            new_flat_nest = flat_nest[query_result]
+            new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
         else:
-            result = self.loc[result]
+            new_df = self.loc[result]
 
         if inplace:
-            self._update_inplace(result)
+            self._update_inplace(new_df)
             return None
         else:
-            return result
+            return new_df
+
+    def _set_filtered_flat_df(self, nest_name, flat_df):
+        """Set a filtered flat dataframe for a nested column
+
+        Here we assume that flat_df has a filtered "ordinal" index,
+        e.g. flat_df.index == [0, 2, 2, 2], while self.index
+        is arbitrary (e.g. ["a", "b", "a"]),
+        and self[nest_name].array.list_index is [0, 0, 1, 1, 1, 2, 2, 2, 2].
+        """
+        new_df = self.reset_index(drop=True)
+        new_df[nest_name] = pack_sorted_df_into_struct(flat_df, name=nest_name)
+        return new_df.set_index(self.index)
 
     def _resolve_dropna_target(self, on_nested, subset):
         """resolves the target layer for a given set of dropna kwargs"""
@@ -657,34 +674,32 @@ def dropna(
             return super().dropna(
                 axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
             )
+        if ignore_index:
+            raise ValueError("ignore_index is not supported for nested columns")
         if subset is not None:
             subset = [col.split(".")[-1] for col in subset]
+        target_flat = self[target].nest.to_flat()
+        target_flat = target_flat.set_index(self[target].array.list_index)
         if inplace:
-            target_flat = self[target].nest.to_flat()
             target_flat.dropna(
                 axis=axis,
                 how=how,
                 thresh=thresh,
                 subset=subset,
-                inplace=inplace,
-                ignore_index=ignore_index,
+                inplace=True,
             )
-            self[target] = packer.pack_flat(target_flat)
-            return self
-        # Or if not inplace
-        new_df = self.copy()
-        new_df[target] = packer.pack_flat(
-            new_df[target]
-            .nest.to_flat()
-            .dropna(
+        else:
+            target_flat = target_flat.dropna(
                 axis=axis,
                 how=how,
                 thresh=thresh,
                 subset=subset,
-                inplace=inplace,
-                ignore_index=ignore_index,
+                inplace=False,
             )
-        )
+        new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
+        if inplace:
+            self._update_inplace(new_df)
+            return None
         return new_df
 
     def reduce(self, func, *args, **kwargs) -> NestedFrame:  # type: ignore[override]

diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -648,6 +648,12 @@ def num_chunks(self) -> int:
         """Number of chunks in underlying pyarrow.ChunkedArray"""
         return self._chunked_array.num_chunks
 
+    @property
+    def list_index(self) -> np.ndarray:
+        """Keys mapping values to lists"""
+        list_index = np.arange(len(self))
+        return np.repeat(list_index, np.diff(self.list_offsets))
+
     def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
         """Iterate over single field nested lists, as numpy arrays
 

diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py
@@ -27,7 +27,7 @@ def pack(
     name: str | None = None,
     *,
     index=None,
-    on: None | str | list[str] | np.ndarray = None,
+    on: None | str | list[str] = None,
     dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
 ) -> pd.Series:
     """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
@@ -41,8 +41,8 @@ def pack(
     index : convertible to pd.Index, optional
         Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
         and this value is used to override the index after the nesting.
-    on: str or list of str or np.ndarray, optional
-        Column name(s) to join on, or a grouping key array. If None, the index is used.
+    on: str or list of str, optional
+        Column name(s) to join on. If None, the index is used.
     dtype : dtype or None
         NestedDtype of the output series, or other type to derive from. If None,
         the dtype is inferred from the first non-missing dataframe.
@@ -60,9 +60,7 @@ def pack(
     return pack_seq(obj, name=name, index=index, dtype=dtype)
 
 
-def pack_flat(
-    df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] | np.ndarray = None
-) -> pd.Series:
+def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series:
     """Make a structure of lists representation of a "flat" dataframe.
 
     For the input dataframe with repeated indexes, make a pandas.Series,
@@ -78,8 +76,8 @@ def pack_flat(
         Input dataframe, with repeated indexes.
     name : str, optional
         Name of the pd.Series.
-    on : str or list of str or np.ndarray, optional
-        Column name(s) to join on, or a grouping key array. If None, the index is used.
+    on : str or list of str, optional
+        Column name(s) to join on. If None, the df's index is used.
 
     Returns
     -------
@@ -93,15 +91,10 @@ def pack_flat(
     nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
     """
 
-    if on is None:
-        df_reindexed = df
-    elif isinstance(on, np.ndarray):
-        df_reindexed = df.reindex(index=on)
-    else:
-        df_reindexed = df.set_index(on)
-
+    if on is not None:
+        df = df.set_index(on)
     # pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
-    sorted_flat = df_reindexed.sort_index(kind="stable")
+    sorted_flat = df.sort_index(kind="stable")
     return pack_sorted_df_into_struct(sorted_flat, name=name)
 
 

diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -317,8 +317,8 @@ def test_add_nested_for_empty_df():
 
 
 @pytest.mark.parametrize("pandas", [False, True])
-@pytest.mark.parametrize("index", [None, "a", "c"])
-def test_from_flat(index, pandas):
+@pytest.mark.parametrize("on", [None, "a", "c"])
+def test_from_flat(on, pandas):
     """Test the NestedFrame.from_flat functionality"""
 
     if pandas:
@@ -332,17 +332,17 @@ def test_from_flat(index, pandas):
             index=[0, 0, 0, 1, 1],
         )
 
-    out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], index=index, name="new_nested")
+    out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], on=on, name="new_nested")
 
-    if index is None:
+    if on is None:
         assert list(out_nf.columns) == ["a", "b", "new_nested"]
         assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
         assert len(out_nf) == 2
-    elif index == "a":
+    elif on == "a":
         assert list(out_nf.columns) == ["b", "new_nested"]
         assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
         assert len(out_nf) == 2
-    elif index == "c":  # not what a user likely wants, but should still work
+    elif on == "c":  # not what a user likely wants, but should still work
         assert list(out_nf.columns) == ["a", "b", "new_nested"]
         assert list(out_nf.new_nested.nest.fields) == ["d"]
         assert len(out_nf) == 5