Skip to content

Commit

Permalink
Merge pull request #80 from martindurant/cudf-docs
Browse files Browse the repository at this point in the history
Cudf docs
  • Loading branch information
martindurant authored Oct 21, 2024
2 parents f6f2ad7 + c1ca9e1 commit 5c12191
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 48 deletions.
12 changes: 9 additions & 3 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ Accessor
.. autosummary::
:toctree: generated/

akimbo.mixin.Accessor
Accessor

.. autoclass:: akimbo.mixin.Accessor
.. autoclass:: Accessor
:members:


Expand All @@ -37,13 +37,16 @@ Backends
akimbo.pandas.PandasAwkwardAccessor
akimbo.dask.DaskAwkwardAccessor
akimbo.polars.PolarsAwkwardAccessor
akimbo.cudf.CudfAwkwardAccessor

.. autoclass:: akimbo.pandas.PandasAwkwardAccessor

.. autoclass:: akimbo.dask.DaskAwkwardAccessor

.. autoclass:: akimbo.polars.PolarsAwkwardAccessor

.. autoclass:: akimbo.cudf.CudfAwkwardAccessor


Extensions
~~~~~~~~~~
Expand All @@ -54,7 +57,7 @@ being acted on. Check the ``dir()`` of each (or use tab-completion)
to see the operations available.

.. autoclass:: akimbo.datetimes.DatetimeAccessor
:members: cast
:members:

.. autoclass:: akimbo.strings.StringAccessor
:members:
Expand All @@ -63,3 +66,6 @@ to see the operations available.

<script data-goatcounter="https://akimbo.goatcounter.com/count"
async src="//gc.zgo.at/count.js"></script>

The cuDF backend provides GPU-specific variants of these accessors,
``akimbo.cudf.CudfStringAccessor`` and ``akimbo.cudf.CudfDatetimeAccessor``.
6 changes: 5 additions & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ identical syntax:
- pandas
- dask.dataframe
- polars
- cuDF (in development)
- cuDF


numpy-like API
Expand All @@ -34,6 +34,7 @@ for slicing and accessing data deep in nested structures,
Example: choose every second inner element in a list-of-lists

.. code-block:: python
series.ak[:, ::2]
Any function, ufunc or aggregation at any level
Expand All @@ -43,6 +44,7 @@ For manipulating numerics at deeper levels of your nested structures or
ragged arrays while maintaining the original layout

.. code-block:: python
series.ak.abs() # absolute for all numerical values
series.ak.sum(axis=3) # sum over deeply nested level
series.ak + 1 # numpy-like broadcasting into deeper levels
Expand All @@ -52,6 +54,7 @@ arrays of values, and they will only affect the appropriate parts of the structu
without changing the layout.

.. code-block:: python
series.ak.str.upper()
CPU/GPU numba support
Expand All @@ -64,6 +67,7 @@ in groupby/window operations. If your data is on the GPU, you can
use numba-cuda with slight modifications to your original function.

.. code-block:: python
@numba.njit
def sum_list_of_list(x):
total = 0
Expand Down
28 changes: 18 additions & 10 deletions src/akimbo/ak_from_cudf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import cudf
import pyarrow
import cupy
import numpy
from typing import Optional

from akimbo.utils import NoAttributes

try:
import cudf
import cupy
except ImportError:
cudf = NoAttributes()
cupy = NoAttributes()
import awkward as ak
from awkward._backends.numpy import NumpyBackend
import numpy
import pyarrow
from awkward._backends.cupy import CupyBackend

from awkward._backends.numpy import NumpyBackend

# COPIED from awkward/studies/cudf-to-awkward.py

Expand Down Expand Up @@ -351,7 +357,7 @@ def remove_revertable(layout, **kwargs):
def recurse_finalize(
out: ak.contents.Content,
column: cudf.core.column.column.ColumnBase,
validbits: None | cudf.core.buffer.buffer.Buffer,
validbits: Optional[cudf.core.buffer.buffer.Buffer],
generate_bitmasks: bool,
fix_offsets: bool = True,
):
Expand Down Expand Up @@ -569,13 +575,15 @@ def recurse(
validbits = column.base_mask

to64, dt = _pyarrow_to_numpy_dtype.get(str(arrow_type), (False, None))
if to64:
data = cupy.asarray(data).view(cupy.int32).astype(cupy.int64)
if dt is None:
dt = arrow_type.to_pandas_dtype()
if to64:
data = cupy.asarray(column.base_data).view(cupy.int32).astype(cupy.int64)
else:
data = cupy.asarray(column.base_data)

out = ak.contents.NumpyArray(
cupy.asarray(column.base_data).view(dt),
data.view(dt),
parameters=None,
backend=CupyBackend.instance(),
)
Expand Down
31 changes: 22 additions & 9 deletions src/akimbo/apply_tree.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
from __future__ import annotations

import functools
import inspect
from typing import Sequence
from typing import Callable, Literal, Sequence

import awkward as ak
import pyarrow as pa


class NoDtype:
    # Minimal placeholder whose only attribute is an empty ``kind`` string.
    # NOTE(review): presumably substituted where an object has no numpy-style
    # dtype, so that ``.kind`` comparisons simply fail to match -- confirm
    # against the callers, which are not visible in this view.
    kind = ""


def leaf(*layout, **_):
    """Default matcher: report whether the first layout is a leaf node.

    Only the first positional argument is inspected; any further layouts and
    all keyword arguments are accepted but ignored.  The result is that
    object's ``is_leaf`` attribute, i.e. true for the lowest elements of an
    awkward layout tree.
    """
    first = layout[0]
    return first.is_leaf


Expand Down Expand Up @@ -42,8 +41,22 @@ def func(layout, **kwargs):
return ak.transform(func, arr, *others)


def dec(func, match=leaf, outtype=None, inmode="arrow"):
"""Make a nested/ragged version of an operation to apply throughout a tree"""
def dec(
func: callable,
match: Callable[[ak.contents.Content], bool] = leaf,
outtype: Callable[[ak.contents.Content], ak.contents.Content] | None = None,
inmode: Literal["arrow", "numpy", "ak"] = "arrow",
):
"""Make a nested/ragged version of an operation to apply throughout a tree
Parameters
----------
func: which we want to apply to (parts of) inputted data
match: function to determine if a part of the data structure matches the type we want to
operate on
outtype: postprocessing function after transform
inmode: how ``func`` expects its inputs: as awkward arrays (ak), numpy or arrow
"""

@functools.wraps(func)
def f(self, *args, where=None, match_kwargs=None, **kwargs):
Expand Down Expand Up @@ -110,8 +123,8 @@ def f(self, *args, where=None, match_kwargs=None, **kwargs):
match_kwargs: None | dict
any extra field identifiers for matching a record as OK to process
{'-Kernel documentation follows from the original function-' if f.__doc__ else ''}
===
{'--Kernel documentation follows from the original function--' if f.__doc__ else ''}
{f.__doc__ or str(f)}
"""

Expand Down
40 changes: 29 additions & 11 deletions src/akimbo/cudf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,28 @@
from typing import Callable

import awkward as ak
import cudf
from cudf import DataFrame, Series, _lib as libcudf
from cudf.core.column.string import StringMethods
from cudf.core.column.datetime import DatetimeColumn

from akimbo.utils import NoAttributes

try:
import cudf
from cudf import DataFrame, Series
from cudf import _lib as libcudf
from cudf.core.column.datetime import DatetimeColumn
from cudf.core.column.string import StringMethods
except ImportError:
StringMethods = NoAttributes()
DatetimeColumn = NoAttributes()
libcudf = NoAttributes()
DataFrame = Series = NoAttributes()


from akimbo.ak_from_cudf import cudf_to_awkward as from_cudf
from akimbo.apply_tree import dec, leaf
from akimbo.datetimes import DatetimeAccessor
from akimbo.datetimes import match as match_t
from akimbo.mixin import Accessor
from akimbo.datetimes import DatetimeAccessor, match as match_t
from akimbo.strings import StringAccessor
from akimbo.apply_tree import dec, leaf


def match_string(arr):
Expand All @@ -22,14 +34,15 @@ class CudfStringAccessor(StringAccessor):
"""String operations on nested/var-length data"""

def decode(self, encoding: str = "utf-8"):
raise NotImplementedError("cudf does not support bytearray type, so we can't automatically identify them")
raise NotImplementedError(
"cudf does not support bytearray type, so we can't automatically identify them"
)

def encode(self, encoding: str = "utf-8"):
raise NotImplementedError("cudf does not support bytearray type")


def dec_cu(op, match=match_string):

@functools.wraps(op)
def f(lay, **kwargs):
# op(column, ...)->column
Expand All @@ -47,14 +60,15 @@ def f(lay, **kwargs):
def f(lay, method=meth, **kwargs):
# this is different from dec_cu, because we need to instantiate StringMethods
# before getting the method from it
col = getattr(StringMethods(cudf.Series(lay._to_cudf(cudf, None, len(lay)))), method)(**kwargs)
col = getattr(
StringMethods(cudf.Series(lay._to_cudf(cudf, None, len(lay)))), method
)(**kwargs)
return from_cudf(col).layout

setattr(CudfStringAccessor, meth, dec(func=f, match=match_string, inmode="ak"))


class CudfDatetimeAccessor(DatetimeAccessor):

...


Expand All @@ -76,7 +90,11 @@ def f(lay, method=meth, **kwargs):
return from_cudf(cudf.Series(col)).layout

if isinstance(getattr(DatetimeColumn, meth), property):
setattr(CudfDatetimeAccessor, meth, property(dec(func=f, match=match_t, inmode="ak")))
setattr(
CudfDatetimeAccessor,
meth,
property(dec(func=f, match=match_t, inmode="ak")),
)
else:
setattr(CudfDatetimeAccessor, meth, dec(func=f, match=match_t, inmode="ak"))

Expand Down
22 changes: 22 additions & 0 deletions src/akimbo/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
class NoAttributes:
    """Inert stand-in for an optional package that could not be imported.

    Attribute access and calls on an instance hand back that same instance,
    so chained module-level expressions keep evaluating when the real package
    is missing.  This lets akimbo.cudf be imported without cudf installed, so
    that sphinx can still build the docs on non-GPU systems.
    """

    # Names defined on the class are found by normal lookup, so they are
    # returned as-is instead of falling through to ``__getattr__``.  Note that
    # ``__doc__ = None`` replaces the docstring above at runtime.
    __name__ = "DummyAttributesObject"
    __doc__ = None
    __annotations__ = None

    def __call__(self, *args, **kwargs):
        # Calling the placeholder (with any arguments) yields the placeholder.
        return self

    def __getattr__(self, item):
        # Only reached when regular lookup fails.  Two dunder names get
        # concrete values; every other name resolves to this same instance.
        fixed = {
            "__qualname__": "akimbo.utils.DummyAttributesObject",
            "__type_params__": (),
        }
        return fixed.get(item, self)

    def __dir__(self):
        # Advertise no attributes at all.
        return []
38 changes: 24 additions & 14 deletions tests/test_cudf.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import datetime

import pytest

import pyarrow as pa
import awkward as ak
import pyarrow as pa
import pytest

pytest.importorskip("akimbo.cudf")

import akimbo.cudf
import akimbo.io
import cudf

cudf = pytest.importorskip("cudf")


def test_operator_overload():
Expand Down Expand Up @@ -40,20 +39,31 @@ def test_inner_slicing():


def test_string_methods():
s = pa.array([{"s": ["hey", "Ho"], "i": [0]}, {"s": ["Gar", "go"], "i": [2]}],
type=pa.struct([("s", pa.list_(pa.string())), ("i", pa.list_(pa.int32()))]))
s = pa.array(
[{"s": ["hey", "Ho"], "i": [0]}, {"s": ["Gar", "go"], "i": [2]}],
type=pa.struct([("s", pa.list_(pa.string())), ("i", pa.list_(pa.int32()))]),
)
series = cudf.Series(s)
s2 = series.ak.str.upper()
assert s2.ak.to_list() == [{"s": ["HEY", "HO"], "i": [0]}, {"s": ["GAR", "GO"], "i": [2]}]
assert s2.ak.to_list() == [
{"s": ["HEY", "HO"], "i": [0]},
{"s": ["GAR", "GO"], "i": [2]},
]

assert series.ak.str.upper.__doc__
# kwargs
s2 = series.ak.str.replace(pat="h", repl="B")
assert s2.ak.to_list() == [{"s": ["Bey", "Ho"], "i": [0]}, {"s": ["Gar", "go"], "i": [2]}]
assert s2.ak.to_list() == [
{"s": ["Bey", "Ho"], "i": [0]},
{"s": ["Gar", "go"], "i": [2]},
]

# positional args
s2 = series.ak.str.replace("h", "B")
assert s2.ak.to_list() == [{"s": ["Bey", "Ho"], "i": [0]}, {"s": ["Gar", "go"], "i": [2]}]
assert s2.ak.to_list() == [
{"s": ["Bey", "Ho"], "i": [0]},
{"s": ["Gar", "go"], "i": [2]},
]

# non-str output
s2 = series.ak.str.len()
Expand All @@ -63,12 +73,12 @@ def test_string_methods():
def test_cast():
s = cudf.Series([0, 1, 2])
# shows that cast to timestamp needs to be two-step in cudf
s2 = s.ak.cast('m8[s]').ak.cast('M8[s]')
s2 = s.ak.cast("m8[s]").ak.cast("M8[s]")
out = s2.ak.to_list()
assert out == [
datetime.datetime(1970, 1, 1, 0, 0),
datetime.datetime(1970, 1, 1, 0, 0, 1),
datetime.datetime(1970, 1, 1, 0, 0, 2)
datetime.datetime(1970, 1, 1, 0, 0, 2),
]


Expand All @@ -77,7 +87,7 @@ def test_times():
datetime.datetime(1970, 1, 1, 0, 0),
datetime.datetime(1970, 1, 1, 0, 0, 1),
None,
datetime.datetime(1970, 1, 1, 0, 0, 2)
datetime.datetime(1970, 1, 1, 0, 0, 2),
]
arr = ak.Array([[data], [], [data]])
s = akimbo.io.ak_to_series(arr, "cudf")
Expand Down

0 comments on commit 5c12191

Please sign in to comment.