Skip to content

Commit

Permalink
Query expression preflighting now prohibits crossing nests.
Browse files Browse the repository at this point in the history
Doing so never worked, but now it is correctly detected.
  • Loading branch information
gitosaurus committed Oct 31, 2024
1 parent b8707a9 commit db29663
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 32 deletions.
10 changes: 5 additions & 5 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from nested_pandas.series.dtype import NestedDtype

from ..series.packer import pack_sorted_df_into_struct
from .utils import NestingType, check_expr_nesting
from .utils import check_expr_nesting


class NestedPandasExprVisitor(PandasExprVisitor):
Expand Down Expand Up @@ -506,11 +506,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
raise ValueError(msg)
kwargs["level"] = kwargs.pop("level", 0) + 1
kwargs["target"] = None
# At present, the query expression must be either entirely within the
# nested namespace or the base namespace. Mixed structures are not
# At present, the query expression must be either entirely within a
# single nest, or have nothing but base columns. Mixed structures are not
# supported, so preflight the expression.
nesting_types = check_expr_nesting(expr)
if NestingType.NESTED in nesting_types and NestingType.BASE in nesting_types:
nest_names = check_expr_nesting(expr)
if len(nest_names) > 1:
raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
result = self.eval(expr, **kwargs)
# If the result is a _SeriesFromNest, then the evaluation has caused unpacking,
Expand Down
47 changes: 37 additions & 10 deletions src/nested_pandas/nestedframe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,56 @@ class NestingType(Enum):
NESTED = "nested"


def _expr_nesting_type(node: ast.expr | None) -> set[NestingType]:
def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[str, list]:
"""
Given an expression which contains references to both base and nested columns,
return a dictionary of the sub-expressions that should be evaluated independently.
The key of the dictionary is the name of the nested column, and will be a blank
string in the case of base columns. The value is a list of the parent nodes
that lead to sub-expressions that can be evaluated successfully.
While this is not in use today for automatically splitting expressions, it can
be used to detect whether an expression is suitably structured for evaluation:
the returned dictionary should have a single key.
"""
if not isinstance(node, ast.expr):
return set()
return {}
if isinstance(node, ast.Name):
return {NestingType.BASE}
if isinstance(node, ast.Attribute):
return {NestingType.NESTED}
return {"": parents}
if isinstance(node, ast.Attribute) and isinstance(node.value, ast.Name):
return {node.value.id: parents}
sources = (
[getattr(node, "left", None), getattr(node, "right", None)]
+ getattr(node, "values", [])
+ getattr(node, "comparators", [])
)
result: set[NestingType] = set()
result: dict[str, list] = {}
for s in sources:
result.update(_expr_nesting_type(s))
child = _actionable_splits(parents, s)
for k, v in child.items():
result[k] = result.get(k, []) + v
# After a complete traversal across sources, check for any necessary splits.
# If it's homogenous, move the split-node up the tree.
if len(result) == 1:
# Let the record of each parent node drift up the tree,
# and merge the subtrees into a single node, since by definition,
# this node is homogeneous over all of its children, and can
# be evaluated in a single step.
result = {k: [node] for k in result}
else:
# At this point, we need to split the expression. The idea here is that
# we want a succession of efficient queries, each of which will produce
# a subset of either the base or the nested columns.
pass
return result


def check_expr_nesting(expr: str) -> set[NestingType]:
def check_expr_nesting(expr: str) -> set[str]:
"""
Given a string expression, parse it and visit the resulting AST, surfacing
the nesting types. The purpose is to identify expressions that attempt
to mix base and nested columns, which will need to be handled specially.
to mix base and nested columns, or columns from two different nests.
"""
expr_tree = ast.parse(expr, mode="eval").body
return _expr_nesting_type(expr_tree)
separable = _actionable_splits([], expr_tree)
return set(list(separable.keys()))
29 changes: 12 additions & 17 deletions tests/nested_pandas/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd
import pytest
from nested_pandas import NestedFrame
from nested_pandas.nestedframe.utils import NestingType, check_expr_nesting
from nested_pandas.nestedframe.utils import check_expr_nesting
from nested_pandas.utils import count_nested


Expand Down Expand Up @@ -52,21 +52,16 @@ def test_check_expr_nesting():
used to ensure that an expression-based query does not try to combine base and nested
sub-expressions.
"""
assert check_expr_nesting("a > 2 & nested.c > 1") == {NestingType.NESTED, NestingType.BASE}
assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {NestingType.NESTED}
assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {NestingType.BASE}
assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {
NestingType.NESTED,
NestingType.BASE,
}
assert check_expr_nesting("a > 2 & nested.c > 1") == {"", "nested"}
assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {"nested"}
assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {""}
assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""}

# NOTE: this correctly captures the desired behavior here, but suggests that the two nests
# are interoperable, which is too strong a claim.
assert check_expr_nesting("a.b > 2 & c.d < 5") == {NestingType.NESTED}
assert check_expr_nesting("a.b > 2 & c.d < 5") == {"a", "c"}

assert check_expr_nesting("a>3") == {NestingType.BASE}
assert check_expr_nesting("a > 3") == {NestingType.BASE}
assert check_expr_nesting("test.a>5&b==2") == {NestingType.NESTED, NestingType.BASE}
assert check_expr_nesting("test.a > 5 & b == 2") == {NestingType.NESTED, NestingType.BASE}
assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {NestingType.NESTED}
assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {NestingType.NESTED}
assert check_expr_nesting("a>3") == {""}
assert check_expr_nesting("a > 3") == {""}
assert check_expr_nesting("test.a>5&b==2") == {"test", ""}
assert check_expr_nesting("test.a > 5 & b == 2") == {"test", ""}
assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {"a"}
assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {"a"}

0 comments on commit db29663

Please sign in to comment.