Skip to content

Commit

Permalink
Implement 'on' for pack and .add_nested
Browse files Browse the repository at this point in the history
  • Loading branch information
hombit committed Nov 1, 2024
1 parent 1a2a1d2 commit 232c97f
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 5 deletions.
5 changes: 4 additions & 1 deletion src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ def add_nested(
name: str,
*,
how: str = "left",
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> Self: # type: ignore[name-defined] # noqa: F821
"""Packs input object to a nested column and adds it to the NestedFrame
Expand Down Expand Up @@ -272,6 +273,8 @@ def add_nested(
index, and sort it lexicographically.
- inner: form intersection of calling frame's index with other
frame's index, preserving the order of the calling index.
on : str or list of str, default: None
Column name(s) of `obj` to use as its index for the join. If None, the index of `obj` is used.
dtype : dtype or None
NestedDtype to use for the nested column; pd.ArrowDtype or
pa.DataType can also be used to specify the nested dtype. If None,
Expand All @@ -283,7 +286,7 @@ def add_nested(
A new NestedFrame with the added nested column.
"""
# Add sources to objects
packed = packer.pack(obj, name=name, dtype=dtype)
packed = packer.pack(obj, name=name, on=on, dtype=dtype)
new_df = self.copy()
return new_df.join(packed, how=how)

Expand Down
22 changes: 18 additions & 4 deletions src/nested_pandas/series/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def pack(
name: str | None = None,
*,
index=None,
on: None | str | list[str] | np.ndarray = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> pd.Series:
"""Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
Expand All @@ -40,6 +41,8 @@ def pack(
index : convertible to pd.Index, optional
Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
and this value is used to override the index after the nesting.
on : str or list of str or np.ndarray, optional
Column name(s) to join on, or a grouping key array. If None, the index is used.
dtype : dtype or None
NestedDtype of the output series, or other type to derive from. If None,
the dtype is inferred from the first non-missing dataframe.
Expand All @@ -50,14 +53,16 @@ def pack(
Output series.
"""
if isinstance(obj, pd.DataFrame):
nested = pack_flat(obj, name=name)
nested = pack_flat(obj, name=name, on=on)
if index is not None:
nested.index = index
return nested
return pack_seq(obj, name=name, index=index, dtype=dtype)


def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
def pack_flat(
df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] | np.ndarray = None
) -> pd.Series:
"""Make a structure of lists representation of a "flat" dataframe.
For the input dataframe with repeated indexes, make a pandas.Series,
Expand All @@ -73,6 +78,8 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
Input dataframe, with repeated indexes.
name : str, optional
Name of the pd.Series.
on : str or list of str or np.ndarray, optional
Column name(s) to join on, or a grouping key array. If None, the index is used.
Returns
-------
Expand All @@ -86,9 +93,16 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
"""

if on is None:
df_reindexed = df
elif isinstance(on, np.ndarray):
df_reindexed = df.reindex(index=on)

Check warning on line 99 in src/nested_pandas/series/packer.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/packer.py#L98-L99

Added lines #L98 - L99 were not covered by tests
else:
df_reindexed = df.set_index(on)

Check warning on line 101 in src/nested_pandas/series/packer.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/packer.py#L101

Added line #L101 was not covered by tests

# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(flat, name=name)
sorted_flat = df_reindexed.sort_index(kind="stable")
return pack_sorted_df_into_struct(sorted_flat, name=name)


def pack_seq(
Expand Down

0 comments on commit 232c97f

Please sign in to comment.