Skip to content

Commit

Permalink
feat(api): add name argument to value_counts
Browse files (browse the repository at this point in the history)
  • Loading branch information
jcrist committed Sep 11, 2024
1 parent 1652076 commit 24be184
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 38 deletions.
56 changes: 34 additions & 22 deletions ibis/expr/types/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2155,33 +2155,28 @@ def count(self, where: ir.BooleanValue | None = None) -> ir.IntegerScalar:
"""
return ops.Count(self, where=self._bind_to_parent_table(where)).to_expr()

def value_counts(self) -> ir.Table:
def value_counts(self, *, name: str | None = None) -> ir.Table:
"""Compute a frequency table.
Parameters
----------
name
The name to use for the frequency column. A suitable name will be
automatically generated if not provided.
Returns
-------
Table
Frequency table expression
The frequency table.
Examples
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.memtable({"chars": char} for char in "aabcddd")
>>> t
┏━━━━━━━━┓
┃ chars ┃
┡━━━━━━━━┩
│ string │
├────────┤
│ a │
│ a │
│ b │
│ c │
│ d │
│ d │
│ d │
└────────┘
>>> t = ibis.memtable({"chars": ["a", "a", "b", "c", "c", "c", "d", "d", "d", "d"]})
Compute the count of each unique value in "chars", ordered by "chars":
>>> t.chars.value_counts().order_by("chars")
┏━━━━━━━━┳━━━━━━━━━━━━━┓
┃ chars ┃ chars_count ┃
Expand All @@ -2190,13 +2185,30 @@ def value_counts(self) -> ir.Table:
├────────┼─────────────┤
│ a │ 2 │
│ b │ 1 │
│ c │ 1 │
│ d │ 3 │
│ c │ 3
│ d │ 4
└────────┴─────────────┘
Compute the count of each unique value in "chars" as a column named
"freq", ordered by "freq":
>>> t.chars.value_counts(name="freq").order_by("freq")
┏━━━━━━━━┳━━━━━━━┓
┃ chars ┃ freq ┃
┡━━━━━━━━╇━━━━━━━┩
│ string │ int64 │
├────────┼───────┤
│ b │ 1 │
│ a │ 2 │
│ c │ 3 │
│ d │ 4 │
└────────┴───────┘
"""
name = self.get_name()
metric = _.count().name(f"{name}_count")
return self.as_table().group_by(name).aggregate(metric)
colname = self.get_name()
if name is None:
name = f"{colname}_count"
t = self.as_table()
return t.group_by(t[colname]).aggregate(t.count().name(name))

def first(
self,
Expand Down
22 changes: 12 additions & 10 deletions ibis/tests/expr/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,19 +871,21 @@ def test_group_by_column_select_api(table):
getattr(grouped.f, fn)()


def test_value_counts_convenience(table):
# #152
result = table.g.value_counts()
expected = table.select("g").group_by("g").aggregate(g_count=lambda t: t.count())
def test_value_counts(table):
expr1 = table.g.value_counts()
expr2 = table[["g"]].group_by("g").aggregate(g_count=_.count())
assert expr1.columns == ["g", "g_count"]
assert_equal(expr1, expr2)

assert_equal(result, expected)
expr3 = table.g.value_counts(name="freq")
expr4 = table[["g"]].group_by("g").aggregate(freq=_.count())
assert expr3.columns == ["g", "freq"]
assert_equal(expr3, expr4)


def test_isin_value_counts(table):
# #157, this code path was untested before
bool_clause = table.g.notin(["1", "4", "7"])
# it works!
bool_clause.name("notin").value_counts()
def test_value_counts_on_window_function(table):
expr = (table.a - table.a.mean()).name("x").value_counts(name="count")
assert expr.columns == ["x", "count"]


def test_value_counts_unnamed_expr(con):
Expand Down
6 changes: 0 additions & 6 deletions ibis/tests/expr/test_value_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,12 +289,6 @@ def test_isin_notin_list(table, container):
assert isinstance(not_expr.op().arg, ops.InValues)


def test_value_counts(table, string_col):
bool_clause = table[string_col].notin(["1", "4", "7"])
expr = table.filter(bool_clause)[string_col].value_counts()
assert isinstance(expr, ir.Table)


def test_isin_notin_scalars():
a, b, c = (ibis.literal(x) for x in [1, 1, 2])

Expand Down

0 comments on commit 24be184

Please sign in to comment.