Implement 'on' for pack and .add_nested

lincc-frameworks · Nov 1, 2024 · 232c97f · 232c97f
1 parent 1a2a1d2
commit 232c97f
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 5 deletions.
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -242,6 +242,7 @@ def add_nested(
         name: str,
         *,
         how: str = "left",
+        on: None | str | list[str] = None,
         dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
     ) -> Self:  # type: ignore[name-defined] # noqa: F821
         """Packs input object to a nested column and adds it to the NestedFrame
@@ -272,6 +273,8 @@ def add_nested(
               index, and sort it lexicographically.
             - inner: form intersection of calling frame's index with other
               frame's index, preserving the order of the calling index.
+        on : str or list of str, default: None
+            Column(s) of `obj` to use as its join key; if None, `obj` is joined on its index.
         dtype : dtype or None
             NestedDtype to use for the nested column; pd.ArrowDtype or
             pa.DataType can also be used to specify the nested dtype. If None,
@@ -283,7 +286,7 @@ def add_nested(
             A new NestedFrame with the added nested column.
         """
         # Add sources to objects
-        packed = packer.pack(obj, name=name, dtype=dtype)
+        packed = packer.pack(obj, name=name, on=on, dtype=dtype)
         new_df = self.copy()
         return new_df.join(packed, how=how)
 

diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py
@@ -27,6 +27,7 @@ def pack(
     name: str | None = None,
     *,
     index=None,
+    on: None | str | list[str] | np.ndarray = None,
     dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
 ) -> pd.Series:
     """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
@@ -40,6 +41,8 @@ def pack(
     index : convertable to pd.Index, optional
         Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
         and this value is used to override the index after the nesting.
+    on : str or list of str or np.ndarray, optional
+        Column name(s) to set as index, or an array of labels to reindex by. If None, the index is used.
     dtype : dtype or None
         NestedDtype of the output series, or other type to derive from. If None,
         the dtype is inferred from the first non-missing dataframe.
@@ -50,14 +53,16 @@ def pack(
         Output series.
     """
     if isinstance(obj, pd.DataFrame):
-        nested = pack_flat(obj, name=name)
+        nested = pack_flat(obj, name=name, on=on)
         if index is not None:
             nested.index = index
         return nested
     return pack_seq(obj, name=name, index=index, dtype=dtype)
 
 
-def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
+def pack_flat(
+    df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] | np.ndarray = None
+) -> pd.Series:
     """Make a structure of lists representation of a "flat" dataframe.
 
     For the input dataframe with repeated indexes, make a pandas.Series,
@@ -73,6 +78,8 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
         Input dataframe, with repeated indexes.
     name : str, optional
         Name of the pd.Series.
+    on : str or list of str or np.ndarray, optional
+        Column name(s) to set as index, or an array of labels to reindex by. If None, the index is used.
 
     Returns
     -------
@@ -86,9 +93,16 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
     nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
     """
 
+    if on is None:
+        df_reindexed = df
+    elif isinstance(on, np.ndarray):
+        df_reindexed = df.reindex(index=on)
+    else:
+        df_reindexed = df.set_index(on)
+
     # pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
-    flat = df.sort_index(kind="stable")
-    return pack_sorted_df_into_struct(flat, name=name)
+    sorted_flat = df_reindexed.sort_index(kind="stable")
+    return pack_sorted_df_into_struct(sorted_flat, name=name)
 
 
 def pack_seq(