Skip to content

Commit

Permalink
Fix non-unique index case
Browse files Browse the repository at this point in the history
  • Loading branch information
hombit committed Nov 1, 2024
1 parent 232c97f commit a6b83d8
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 60 deletions.
91 changes: 53 additions & 38 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,12 @@
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default
from pandas.api.types import is_bool_dtype
from pandas.core.computation.expr import PARSERS, PandasExprVisitor

from nested_pandas.series import packer
from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.series.dtype import NestedDtype

from ..series.packer import pack_sorted_df_into_struct
from .utils import extract_nest_names
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct


class NestedPandasExprVisitor(PandasExprVisitor):
Expand Down Expand Up @@ -219,10 +218,8 @@ def __setitem__(self, key, value):
"." in key and key.split(".")[0] in self.nested_columns
):
nested, col = key.split(".")
new_flat = self[nested].nest.to_flat()
new_flat[col] = value
packed = packer.pack(new_flat)
return super().__setitem__(nested, packed)
new_nested_series = self[nested].nest.with_flat_field(col, value)
return super().__setitem__(nested, new_nested_series)

# Adding a new nested structure from a column
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
Expand All @@ -231,8 +228,9 @@ def __setitem__(self, key, value):
if isinstance(value, pd.Series):
value.name = col
value = value.to_frame()
packed = packer.pack(value)
return super().__setitem__(new_nested, packed)
new_df = self.add_nested(value, name=new_nested)
self._update_inplace(new_df)
return None

return super().__setitem__(key, value)

Expand Down Expand Up @@ -286,12 +284,12 @@ def add_nested(
A new NestedFrame with the added nested column.
"""
# Add sources to objects
packed = packer.pack(obj, name=name, on=on, dtype=dtype)
packed = pack(obj, name=name, on=on, dtype=dtype)
new_df = self.copy()
return new_df.join(packed, how=how)

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
"""Creates a NestedFrame with base and nested columns from a flat
dataframe.
Expand All @@ -307,7 +305,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
in the list will attempt to be packed into a single nested column
with the name provided in `name`. If None, is defined as all
columns not in `base_columns`.
index: str, or None
on: str or None
The name of a column to use as the new index. Typically, the index
should have a unique value per row for base columns, and should
repeat for nested columns. For example, a dataframe with two
Expand All @@ -333,11 +331,11 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
"""

# Resolve new index
if index is not None:
if on is not None:
# if a base column is chosen remove it
if index in base_columns:
base_columns = [col for col in base_columns if col != index]
df = df.set_index(index)
if on in base_columns:
base_columns = [col for col in base_columns if col != on]
df = df.set_index(on)

# drop duplicates on index
out_df = df[base_columns][~df.index.duplicated(keep="first")]
Expand Down Expand Up @@ -404,7 +402,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
raise ValueError("No columns were assigned as list columns.")

# Pack list columns into a nested column
packed_df = packer.pack_lists(df[list_columns])
packed_df = pack_lists(df[list_columns])
packed_df.name = name

# join the nested column to the base_column df
Expand Down Expand Up @@ -521,18 +519,37 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
# to the nest and repack. Otherwise, apply it to this instance as usual,
# since it operated on the base attributes.
if isinstance(result, _SeriesFromNest):
if not is_bool_dtype(result.dtype):
raise ValueError("Query condition must evaluate to a boolean Series")

Check warning on line 523 in src/nested_pandas/nestedframe/core.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/nestedframe/core.py#L523

Added line #L523 was not covered by tests

nest_name, flat_nest = result.nest_name, result.flat_nest
new_flat_nest = flat_nest.loc[result]
result = self.copy()
result[nest_name] = pack_sorted_df_into_struct(new_flat_nest)

# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
flat_nest = flat_nest.set_index(self[nest_name].array.list_index)
query_result = result.set_axis(self[nest_name].array.list_index)
# Selecting flat values matching the query result
new_flat_nest = flat_nest[query_result]
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
else:
result = self.loc[result]
new_df = self.loc[result]

if inplace:
self._update_inplace(result)
self._update_inplace(new_df)
return None
else:
return result
return new_df

def _set_filtered_flat_df(self, nest_name, flat_df):
    """Return a frame whose nested column is rebuilt from a filtered flat frame.

    The flat frame is expected to be indexed by *row position* ("ordinal"
    index) rather than by the labels of this frame. For example, with
    ``self.index == ["a", "b", "a"]`` and
    ``self[nest_name].array.list_index == [0, 0, 1, 1, 1, 2, 2, 2, 2]``,
    a filtered ``flat_df`` may have ``flat_df.index == [0, 2, 2, 2]``.

    Parameters
    ----------
    nest_name : str
        Name of the nested column to replace.
    flat_df : pd.DataFrame
        Filtered flat representation of the nested column, carrying the
        ordinal index described above.

    Returns
    -------
    NestedFrame
        Frame with the repacked nested column and the index of ``self``.
    """
    repacked = pack_sorted_df_into_struct(flat_df, name=nest_name)
    # Assign on a positional (0..n-1) index so that the ordinal keys of
    # the packed series line up with rows even when labels are not unique.
    ordinal_frame = self.reset_index(drop=True)
    ordinal_frame[nest_name] = repacked
    # Put the original (possibly non-unique) labels back.
    return ordinal_frame.set_index(self.index)

def _resolve_dropna_target(self, on_nested, subset):
"""resolves the target layer for a given set of dropna kwargs"""
Expand Down Expand Up @@ -657,34 +674,32 @@ def dropna(
return super().dropna(
axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
)
if ignore_index:
raise ValueError("ignore_index is not supported for nested columns")

Check warning on line 678 in src/nested_pandas/nestedframe/core.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/nestedframe/core.py#L678

Added line #L678 was not covered by tests
if subset is not None:
subset = [col.split(".")[-1] for col in subset]
target_flat = self[target].nest.to_flat()
target_flat = target_flat.set_index(self[target].array.list_index)
if inplace:
target_flat = self[target].nest.to_flat()
target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=True,
)
self[target] = packer.pack_flat(target_flat)
return self
# Or if not inplace
new_df = self.copy()
new_df[target] = packer.pack_flat(
new_df[target]
.nest.to_flat()
.dropna(
else:
target_flat = target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=False,
)
)
new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
if inplace:
self._update_inplace(new_df)
return None
return new_df

def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override]
Expand Down
6 changes: 6 additions & 0 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,12 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

@property
def list_index(self) -> np.ndarray:
    """Ordinal key of the owning list for every flat value.

    Each list position ``i`` is repeated as many times as list ``i`` has
    elements, e.g. offsets ``[0, 3, 5, 8]`` give
    ``[0, 0, 0, 1, 1, 2, 2, 2]``.
    """
    # Consecutive offset differences are the per-list lengths.
    list_lengths = np.diff(self.list_offsets)
    positions = np.arange(len(self))
    return np.repeat(positions, list_lengths)

def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
"""Iterate over single field nested lists, as numpy arrays
Expand Down
25 changes: 9 additions & 16 deletions src/nested_pandas/series/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def pack(
name: str | None = None,
*,
index=None,
on: None | str | list[str] | np.ndarray = None,
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> pd.Series:
"""Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
Expand All @@ -41,8 +41,8 @@ def pack(
index : convertible to pd.Index, optional
Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
and this value is used to override the index after the nesting.
on: str or list of str or np.ndarray, optional
Column name(s) to join on, or a grouping key array. If None, the index is used.
on: str or list of str, optional
Column name(s) to join on. If None, the index is used.
dtype : dtype or None
NestedDtype of the output series, or other type to derive from. If None,
the dtype is inferred from the first non-missing dataframe.
Expand All @@ -60,9 +60,7 @@ def pack(
return pack_seq(obj, name=name, index=index, dtype=dtype)


def pack_flat(
df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] | np.ndarray = None
) -> pd.Series:
def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series:
"""Make a structure of lists representation of a "flat" dataframe.
For the input dataframe with repeated indexes, make a pandas.Series,
Expand All @@ -78,8 +76,8 @@ def pack_flat(
Input dataframe, with repeated indexes.
name : str, optional
Name of the pd.Series.
on : str or list of str or np.ndarray, optional
Column name(s) to join on, or a grouping key array. If None, the index is used.
on : str or list of str, optional
Column name(s) to join on. If None, the df's index is used.
Returns
-------
Expand All @@ -93,15 +91,10 @@ def pack_flat(
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
"""

if on is None:
df_reindexed = df
elif isinstance(on, np.ndarray):
df_reindexed = df.reindex(index=on)
else:
df_reindexed = df.set_index(on)

if on is not None:
df = df.set_index(on)

Check warning on line 95 in src/nested_pandas/series/packer.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/packer.py#L95

Added line #L95 was not covered by tests
# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
sorted_flat = df_reindexed.sort_index(kind="stable")
sorted_flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(sorted_flat, name=name)


Expand Down
12 changes: 6 additions & 6 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,8 @@ def test_add_nested_for_empty_df():


@pytest.mark.parametrize("pandas", [False, True])
@pytest.mark.parametrize("index", [None, "a", "c"])
def test_from_flat(index, pandas):
@pytest.mark.parametrize("on", [None, "a", "c"])
def test_from_flat(on, pandas):
"""Test the NestedFrame.from_flat functionality"""

if pandas:
Expand All @@ -332,17 +332,17 @@ def test_from_flat(index, pandas):
index=[0, 0, 0, 1, 1],
)

out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], index=index, name="new_nested")
out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], on=on, name="new_nested")

if index is None:
if on is None:
assert list(out_nf.columns) == ["a", "b", "new_nested"]
assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
assert len(out_nf) == 2
elif index == "a":
elif on == "a":
assert list(out_nf.columns) == ["b", "new_nested"]
assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
assert len(out_nf) == 2
elif index == "c": # not what a user likely wants, but should still work
elif on == "c": # not what a user likely wants, but should still work
assert list(out_nf.columns) == ["a", "b", "new_nested"]
assert list(out_nf.new_nested.nest.fields) == ["d"]
assert len(out_nf) == 5
Expand Down

0 comments on commit a6b83d8

Please sign in to comment.