Merge pull request #80 from martindurant/cudf-docs

Cudf docs
intake · Oct 21, 2024 · 5c12191 · 5c12191
2 parents f6f2ad7 + c1ca9e1
commit 5c12191
Show file tree

Hide file tree

Showing 7 changed files with 129 additions and 48 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -24,9 +24,9 @@ Accessor
 .. autosummary::
    :toctree: generated/
 
-   akimbo.mixin.Accessor
+   Accessor
 
-.. autoclass:: akimbo.mixin.Accessor
+.. autoclass:: Accessor
    :members:
 
 
@@ -37,13 +37,16 @@ Backends
     akimbo.pandas.PandasAwkwardAccessor
     akimbo.dask.DaskAwkwardAccessor
     akimbo.polars.PolarsAwkwardAccessor
+    akimbo.cudf.CudfAwkwardAccessor
 
 .. autoclass:: akimbo.pandas.PandasAwkwardAccessor
 
 .. autoclass:: akimbo.dask.DaskAwkwardAccessor
 
 .. autoclass:: akimbo.polars.PolarsAwkwardAccessor
 
+.. autoclass:: akimbo.cudf.CudfAwkwardAccessor
+
 
 Extensions
 ~~~~~~~~~~
@@ -54,7 +57,7 @@ being acted on. Check the ``dir()`` of each (or use tab-completion)
 to see the operations available.
 
 .. autoclass:: akimbo.datetimes.DatetimeAccessor
-   :members: cast
+   :members:
 
 .. autoclass:: akimbo.strings.StringAccessor
    :members:
@@ -63,3 +66,6 @@ to see the operations available.
 
     <script data-goatcounter="https://akimbo.goatcounter.com/count"
             async src="//gc.zgo.at/count.js"></script>
+
+The cuDF backend provides GPU-specific variants of these accessors:
+``akimbo.cudf.CudfStringAccessor`` and ``akimbo.cudf.CudfDatetimeAccessor``.
diff --git a/docs/index.rst b/docs/index.rst
@@ -23,7 +23,7 @@ identical syntax:
 - pandas
 - dask.dataframe
 - polars
-- cuDF (in development)
+- cuDF
 
 
 numpy-like API
@@ -34,6 +34,7 @@ for slicing and accessing data deep in nested structures,
 Example: choose every second inner element in a list-of-lists
 
 .. code-block:: python
+
     series.ak[:, ::2]
 
 Any function, ufunc or aggregation at any level
@@ -43,6 +44,7 @@ For manipulating numerics at deeper levels of your nested structures or
 ragged arrays while maintaining the original layout
 
 .. code-block:: python
+
     series.ak.abs()  # absolute for all numerical values
     series.ak.sum(axis=3)  # sum over deeply nested level
     series.ak + 1  # numpy-like broadcasting into deeper levels
@@ -52,6 +54,7 @@ arrays of values, and they will only affect the appropriate parts of the structu
 without changing the layout.
 
 .. code-block:: python
+
     series.ak.str.upper()
 
 CPU/GPU numba support
@@ -64,6 +67,7 @@ in groupby/window operations. If your data is on the GPU, you can
 use numba-cuda with slight modifications to your original function.
 
 .. code-block:: python
+
     @numba.njit
     def sum_list_of_list(x):
         total = 0

diff --git a/src/akimbo/ak_from_cudf.py b/src/akimbo/ak_from_cudf.py
@@ -1,12 +1,18 @@
-import cudf
-import pyarrow
-import cupy
-import numpy
+from typing import Optional
+
+from akimbo.utils import NoAttributes
 
+try:
+    import cudf
+    import cupy
+except ImportError:
+    cudf = NoAttributes()
+    cupy = NoAttributes()
 import awkward as ak
-from awkward._backends.numpy import NumpyBackend
+import numpy
+import pyarrow
 from awkward._backends.cupy import CupyBackend
-
+from awkward._backends.numpy import NumpyBackend
 
 # COPIED from awkward/studies/cudf-to-awkward.py
 
@@ -351,7 +357,7 @@ def remove_revertable(layout, **kwargs):
 def recurse_finalize(
     out: ak.contents.Content,
     column: cudf.core.column.column.ColumnBase,
-    validbits: None | cudf.core.buffer.buffer.Buffer,
+    validbits: Optional[cudf.core.buffer.buffer.Buffer],
     generate_bitmasks: bool,
     fix_offsets: bool = True,
 ):
@@ -569,13 +575,15 @@ def recurse(
         validbits = column.base_mask
 
         to64, dt = _pyarrow_to_numpy_dtype.get(str(arrow_type), (False, None))
-        if to64:
-            data = cupy.asarray(data).view(cupy.int32).astype(cupy.int64)
         if dt is None:
             dt = arrow_type.to_pandas_dtype()
+        if to64:
+            data = cupy.asarray(column.base_data).view(cupy.int32).astype(cupy.int64)
+        else:
+            data = cupy.asarray(column.base_data)
 
         out = ak.contents.NumpyArray(
-            cupy.asarray(column.base_data).view(dt),
+            data.view(dt),
             parameters=None,
             backend=CupyBackend.instance(),
         )

diff --git a/src/akimbo/apply_tree.py b/src/akimbo/apply_tree.py
@@ -1,16 +1,15 @@
+from __future__ import annotations
+
 import functools
 import inspect
-from typing import Sequence
+from typing import Callable, Literal, Sequence
 
 import awkward as ak
 import pyarrow as pa
 
 
-class NoDtype:
-    kind = ""
-
-
 def leaf(*layout, **_):
+    """True for the lowest elements of any awkward layout tree"""
     return layout[0].is_leaf
 
 
@@ -42,8 +41,22 @@ def func(layout, **kwargs):
     return ak.transform(func, arr, *others)
 
 
-def dec(func, match=leaf, outtype=None, inmode="arrow"):
-    """Make a nested/ragged version of an operation to apply throughout a tree"""
+def dec(
+    func: callable,
+    match: Callable[[ak.contents.Content], bool] = leaf,
+    outtype: Callable[[ak.contents.Content], ak.contents.Content] | None = None,
+    inmode: Literal["arrow", "numpy", "ak"] = "arrow",
+):
+    """Make a nested/ragged version of an operation to apply throughout a tree
+
+    Parameters
+    ----------
+    func: the function we want to apply to (parts of) the input data
+    match: function to determine if a part of the data structure matches the type we want to
+        operate on
+    outtype: postprocessing function after transform
+    inmode: how ``func`` expects its inputs: as awkward arrays (ak), numpy or arrow
+    """
 
     @functools.wraps(func)
     def f(self, *args, where=None, match_kwargs=None, **kwargs):
@@ -110,8 +123,8 @@ def f(self, *args, where=None, match_kwargs=None, **kwargs):
 match_kwargs: None | dict
     any extra field identifiers for matching a record as OK to process
 
-{'-Kernel documentation follows from the original function-' if f.__doc__ else ''}
-===
+{'--Kernel documentation follows from the original function--' if f.__doc__ else ''}
+
 {f.__doc__ or str(f)}
 """
 

diff --git a/src/akimbo/cudf.py b/src/akimbo/cudf.py
@@ -2,16 +2,28 @@
 from typing import Callable
 
 import awkward as ak
-import cudf
-from cudf import DataFrame, Series, _lib as libcudf
-from cudf.core.column.string import StringMethods
-from cudf.core.column.datetime import DatetimeColumn
+
+from akimbo.utils import NoAttributes
+
+try:
+    import cudf
+    from cudf import DataFrame, Series
+    from cudf import _lib as libcudf
+    from cudf.core.column.datetime import DatetimeColumn
+    from cudf.core.column.string import StringMethods
+except ImportError:
+    StringMethods = NoAttributes()
+    DatetimeColumn = NoAttributes()
+    libcudf = NoAttributes()
+    DataFrame = Series = NoAttributes()
+
 
 from akimbo.ak_from_cudf import cudf_to_awkward as from_cudf
+from akimbo.apply_tree import dec, leaf
+from akimbo.datetimes import DatetimeAccessor
+from akimbo.datetimes import match as match_t
 from akimbo.mixin import Accessor
-from akimbo.datetimes import DatetimeAccessor, match as match_t
 from akimbo.strings import StringAccessor
-from akimbo.apply_tree import dec, leaf
 
 
 def match_string(arr):
@@ -22,14 +34,15 @@ class CudfStringAccessor(StringAccessor):
     """String operations on nested/var-length data"""
 
     def decode(self, encoding: str = "utf-8"):
-        raise NotImplementedError("cudf does not support bytearray type, so we can't automatically identify them")
+        raise NotImplementedError(
+            "cudf does not support bytearray type, so we can't automatically identify them"
+        )
 
     def encode(self, encoding: str = "utf-8"):
         raise NotImplementedError("cudf does not support bytearray type")
 
 
 def dec_cu(op, match=match_string):
-
     @functools.wraps(op)
     def f(lay, **kwargs):
         # op(column, ...)->column
@@ -47,14 +60,15 @@ def f(lay, **kwargs):
     def f(lay, method=meth, **kwargs):
         # this is different from dec_cu, because we need to instantiate StringMethods
         # before getting the method from it
-        col = getattr(StringMethods(cudf.Series(lay._to_cudf(cudf, None, len(lay)))), method)(**kwargs)
+        col = getattr(
+            StringMethods(cudf.Series(lay._to_cudf(cudf, None, len(lay)))), method
+        )(**kwargs)
         return from_cudf(col).layout
 
     setattr(CudfStringAccessor, meth, dec(func=f, match=match_string, inmode="ak"))
 
 
 class CudfDatetimeAccessor(DatetimeAccessor):
-
     ...
 
 
@@ -76,7 +90,11 @@ def f(lay, method=meth, **kwargs):
         return from_cudf(cudf.Series(col)).layout
 
     if isinstance(getattr(DatetimeColumn, meth), property):
-        setattr(CudfDatetimeAccessor, meth, property(dec(func=f, match=match_t, inmode="ak")))
+        setattr(
+            CudfDatetimeAccessor,
+            meth,
+            property(dec(func=f, match=match_t, inmode="ak")),
+        )
     else:
         setattr(CudfDatetimeAccessor, meth, dec(func=f, match=match_t, inmode="ak"))
 

diff --git a/src/akimbo/utils.py b/src/akimbo/utils.py
@@ -0,0 +1,22 @@
+class NoAttributes:
+    """Allows importing akimbo.cudf even if cudf isn't installed
+
+    This is done so that sphinx can still build docs on non-GPU systems.
+    """
+
+    def __dir__(self):
+        return []
+
+    def __getattr__(self, item):
+        if item == "__qualname__":
+            return "akimbo.utils.DummyAttributesObject"
+        if item == "__type_params__":
+            return ()
+        return self
+
+    def __call__(self, *args, **kwargs):
+        return self
+
+    __name__ = "DummyAttributesObject"
+    __doc__ = None
+    __annotations__ = None
diff --git a/tests/test_cudf.py b/tests/test_cudf.py
@@ -1,14 +1,13 @@
 import datetime
 
-import pytest
-
-import pyarrow as pa
 import awkward as ak
+import pyarrow as pa
+import pytest
 
-pytest.importorskip("akimbo.cudf")
-
+import akimbo.cudf
 import akimbo.io
-import cudf
+
+cudf = pytest.importorskip("cudf")
 
 
 def test_operator_overload():
@@ -40,20 +39,31 @@ def test_inner_slicing():
 
 
 def test_string_methods():
-    s = pa.array([{"s": ["hey", "Ho"], "i": [0]}, {"s": ["Gar", "go"], "i": [2]}],
-                 type=pa.struct([("s", pa.list_(pa.string())), ("i", pa.list_(pa.int32()))]))
+    s = pa.array(
+        [{"s": ["hey", "Ho"], "i": [0]}, {"s": ["Gar", "go"], "i": [2]}],
+        type=pa.struct([("s", pa.list_(pa.string())), ("i", pa.list_(pa.int32()))]),
+    )
     series = cudf.Series(s)
     s2 = series.ak.str.upper()
-    assert s2.ak.to_list() == [{"s": ["HEY", "HO"], "i": [0]}, {"s": ["GAR", "GO"], "i": [2]}]
+    assert s2.ak.to_list() == [
+        {"s": ["HEY", "HO"], "i": [0]},
+        {"s": ["GAR", "GO"], "i": [2]},
+    ]
 
     assert series.ak.str.upper.__doc__
     # kwargs
     s2 = series.ak.str.replace(pat="h", repl="B")
-    assert s2.ak.to_list() == [{"s": ["Bey", "Ho"], "i": [0]}, {"s": ["Gar", "go"], "i": [2]}]
+    assert s2.ak.to_list() == [
+        {"s": ["Bey", "Ho"], "i": [0]},
+        {"s": ["Gar", "go"], "i": [2]},
+    ]
 
     # positional args
     s2 = series.ak.str.replace("h", "B")
-    assert s2.ak.to_list() == [{"s": ["Bey", "Ho"], "i": [0]}, {"s": ["Gar", "go"], "i": [2]}]
+    assert s2.ak.to_list() == [
+        {"s": ["Bey", "Ho"], "i": [0]},
+        {"s": ["Gar", "go"], "i": [2]},
+    ]
 
     # non-str output
     s2 = series.ak.str.len()
@@ -63,12 +73,12 @@ def test_string_methods():
 def test_cast():
     s = cudf.Series([0, 1, 2])
     # shows that cast to timestamp needs to be two-step in cudf
-    s2 = s.ak.cast('m8[s]').ak.cast('M8[s]')
+    s2 = s.ak.cast("m8[s]").ak.cast("M8[s]")
     out = s2.ak.to_list()
     assert out == [
         datetime.datetime(1970, 1, 1, 0, 0),
         datetime.datetime(1970, 1, 1, 0, 0, 1),
-        datetime.datetime(1970, 1, 1, 0, 0, 2)
+        datetime.datetime(1970, 1, 1, 0, 0, 2),
     ]
 
 
@@ -77,7 +87,7 @@ def test_times():
         datetime.datetime(1970, 1, 1, 0, 0),
         datetime.datetime(1970, 1, 1, 0, 0, 1),
         None,
-        datetime.datetime(1970, 1, 1, 0, 0, 2)
+        datetime.datetime(1970, 1, 1, 0, 0, 2),
     ]
     arr = ak.Array([[data], [], [data]])
     s = akimbo.io.ak_to_series(arr, "cudf")