Skip to content

Commit

Permalink
ENH: Implement masked algorithm for value_counts (pandas-dev#54984)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Sep 30, 2023
1 parent c8e7a98 commit 6f0cd8d
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 46 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Other enhancements
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- Implemented masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
-

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def value_count(
values: np.ndarray,
dropna: bool,
mask: npt.NDArray[np.bool_] | None = ...,
) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values]
) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... # np.ndarray[same-as-values]

# arr and values should have same dtype
def ismember(
Expand Down
41 changes: 24 additions & 17 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
{{endif}}
cdef:
Py_ssize_t i = 0
Py_ssize_t i = 0, na_counter = 0, na_add = 0
Py_ssize_t n = len(values)
kh_{{ttype}}_t *table

Expand All @@ -49,9 +49,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
bint uses_mask = mask is not None
bint isna_entry = False

if uses_mask and not dropna:
raise NotImplementedError("uses_mask not implemented with dropna=False")

# we track the order in which keys are first seen (GH39009),
# khash-map isn't insertion-ordered, thus:
# table maps keys to counts
Expand Down Expand Up @@ -82,25 +79,31 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
for i in range(n):
val = {{to_c_type}}(values[i])

if uses_mask:
isna_entry = mask[i]

if dropna:
if uses_mask:
isna_entry = mask[i]
else:
if not uses_mask:
isna_entry = is_nan_{{c_type}}(val)

if not dropna or not isna_entry:
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
if uses_mask and isna_entry:
na_counter += 1
else:
k = kh_put_{{ttype}}(table, val, &ret)
table.vals[k] = 1
result_keys.append(val)
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, val, &ret)
table.vals[k] = 1
result_keys.append(val)
{{endif}}

# collect counts in the order corresponding to result_keys:
if na_counter > 0:
na_add = 1
cdef:
int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64)

for i in range(table.size):
{{if dtype == 'object'}}
Expand All @@ -110,9 +113,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
{{endif}}
result_counts[i] = table.vals[k]

if na_counter > 0:
result_counts[table.size] = na_counter
result_keys.append(val)

kh_destroy_{{ttype}}(table)

return result_keys.to_array(), result_counts.base
return result_keys.to_array(), result_counts.base, na_counter


@cython.wraparound(False)
Expand Down Expand Up @@ -399,10 +406,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
ndarray[htfunc_t] modes

int64_t[::1] counts
int64_t count, max_count = -1
int64_t count, _, max_count = -1
Py_ssize_t nkeys, k, j = 0

keys, counts = value_count(values, dropna, mask=mask)
keys, counts, _ = value_count(values, dropna, mask=mask)
nkeys = len(keys)

modes = np.empty(nkeys, dtype=values.dtype)
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,7 +924,7 @@ def value_counts_internal(

else:
values = _ensure_arraylike(values, func_name="value_counts")
keys, counts = value_counts_arraylike(values, dropna)
keys, counts, _ = value_counts_arraylike(values, dropna)
if keys.dtype == np.float16:
keys = keys.astype(np.float32)

Expand All @@ -949,7 +949,7 @@ def value_counts_internal(
# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
"""
Parameters
----------
Expand All @@ -965,7 +965,7 @@ def value_counts_arraylike(
original = values
values = _ensure_data(values)

keys, counts = htable.value_count(values, dropna, mask=mask)
keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)

if needs_i8_conversion(original.dtype):
# datetime, timedelta, or period
Expand All @@ -975,7 +975,7 @@ def value_counts_arraylike(
keys, counts = keys[mask], counts[mask]

res_keys = _reconstruct_data(keys, original.dtype, original)
return res_keys, counts
return res_keys, counts, na_counter


def duplicated(
Expand Down
32 changes: 13 additions & 19 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -1044,28 +1044,22 @@ def value_counts(self, dropna: bool = True) -> Series:
)
from pandas.arrays import IntegerArray

keys, value_counts = algos.value_counts_arraylike(
self._data, dropna=True, mask=self._mask
keys, value_counts, na_counter = algos.value_counts_arraylike(
self._data, dropna=dropna, mask=self._mask
)
mask_index = np.zeros((len(value_counts),), dtype=np.bool_)
mask = mask_index.copy()

if dropna:
res = Series(value_counts, index=keys, name="count", copy=False)
res.index = res.index.astype(self.dtype)
res = res.astype("Int64")
return res
if na_counter > 0:
mask_index[-1] = True

# if we want nans, count the mask
counts = np.empty(len(value_counts) + 1, dtype="int64")
counts[:-1] = value_counts
counts[-1] = self._mask.sum()

index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
index = index.astype(self.dtype)

mask = np.zeros(len(counts), dtype="bool")
counts_array = IntegerArray(counts, mask)

return Series(counts_array, index=index, name="count", copy=False)
arr = IntegerArray(value_counts, mask)
index = Index(
self.dtype.construct_array_type()(
keys, mask_index # type: ignore[arg-type]
)
)
return Series(arr, index=index, name="count", copy=False)

@doc(ExtensionArray.equals)
def equals(self, other) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ def value_counts(self, dropna: bool = True) -> Series:
Series,
)

keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
fcounts = self.sp_index.ngaps
if fcounts > 0 and (not self._null_fill_value or not dropna):
mask = isna(keys) if self._null_fill_value else keys == self.fill_value
Expand Down
19 changes: 15 additions & 4 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,15 +586,26 @@ def test_value_count(self, dtype, writable):
expected = (np.arange(N) + N).astype(dtype)
values = np.repeat(expected, 5)
values.flags.writeable = writable
keys, counts = ht.value_count(values, False)
keys, counts, _ = ht.value_count(values, False)
tm.assert_numpy_array_equal(np.sort(keys), expected)
assert np.all(counts == 5)

def test_value_count_mask(self, dtype):
if dtype == np.object_:
pytest.skip("mask not implemented for object dtype")
values = np.array([1] * 5, dtype=dtype)
mask = np.zeros((5,), dtype=np.bool_)
mask[1] = True
mask[4] = True
keys, counts, na_counter = ht.value_count(values, False, mask=mask)
assert len(keys) == 2
assert na_counter == 2

def test_value_count_stable(self, dtype, writable):
# GH12679
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
values.flags.writeable = writable
keys, counts = ht.value_count(values, False)
keys, counts, _ = ht.value_count(values, False)
tm.assert_numpy_array_equal(keys, values)
assert np.all(counts == 1)

Expand Down Expand Up @@ -685,9 +696,9 @@ def test_unique_label_indices():
class TestHelpFunctionsWithNans:
def test_value_count(self, dtype):
values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
keys, counts = ht.value_count(values, True)
keys, counts, _ = ht.value_count(values, True)
assert len(keys) == 0
keys, counts = ht.value_count(values, False)
keys, counts, _ = ht.value_count(values, False)
assert len(keys) == 1 and np.all(np.isnan(keys))
assert counts[0] == 3

Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/series/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,3 +250,22 @@ def test_value_counts_complex_numbers(self, input_array, expected):
# GH 17927
result = Series(input_array).value_counts()
tm.assert_series_equal(result, expected)

def test_value_counts_masked(self):
# GH#54984
dtype = "Int64"
ser = Series([1, 2, None, 2, None, 3], dtype=dtype)
result = ser.value_counts(dropna=False)
expected = Series(
[2, 2, 1, 1],
index=Index([2, None, 1, 3], dtype=dtype),
dtype=dtype,
name="count",
)
tm.assert_series_equal(result, expected)

result = ser.value_counts(dropna=True)
expected = Series(
[2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count"
)
tm.assert_series_equal(result, expected)

0 comments on commit 6f0cd8d

Please sign in to comment.