ListColumn constructor for cudf 24.10 + np.nan vs np.NaN (#86)

* first commit Signed-off-by: Praateek Mahajan <[email protected]> * remove dead import Signed-off-by: Praateek Mahajan <[email protected]> * add return type hint Signed-off-by: Praateek Mahajan <[email protected]> * fixes Signed-off-by: Praateek Mahajan <[email protected]> * pre-commit Signed-off-by: Praateek Mahajan <[email protected]> * np.NaN -> np.nan Signed-off-by: Praateek Mahajan <[email protected]> * fix version checking Signed-off-by: Praateek Mahajan <[email protected]> * Update github workflow for GPU Tests (#87) * refer to 24.08 gh workflows Signed-off-by: Praateek Mahajan <[email protected]> * update docker img Signed-off-by: Praateek Mahajan <[email protected]> * skip pytest Signed-off-by: Praateek Mahajan <[email protected]> --------- Signed-off-by: Praateek Mahajan <[email protected]> * pre-commit Signed-off-by: Praateek Mahajan <[email protected]> * change how version is understood for cudf Signed-off-by: Praateek Mahajan <[email protected]> * move import out Signed-off-by: Praateek Mahajan <[email protected]> --------- Signed-off-by: Praateek Mahajan <[email protected]>
rapidsai · Sep 24, 2024 · 2a0272a · 2a0272a
1 parent bdcf60f
commit 2a0272a
Show file tree

Hide file tree

Showing 8 changed files with 90 additions and 25 deletions.
diff --git a/crossfit/backend/cudf/series.py b/crossfit/backend/cudf/series.py
@@ -12,9 +12,70 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, Optional
+
 import cudf
 import cupy as cp
 from cudf.core.column import as_column
+from cudf.core.dtypes import ListDtype
+from packaging.version import parse as parse_version
+
+if TYPE_CHECKING:
+    from cudf.core.buffer import Buffer
+    from cudf.core.column import ColumnBase
+    from cudf.core.column.numerical import NumericalColumn
+
+
+@lru_cache
+def _is_cudf_gte_24_10() -> bool:
+    """Report whether the installed cudf belongs to release 24.10 or a later one.
+
+    Pre-release builds are classified by their base version, so a pre-release of
+    24.10 counts as 24.10. The result is memoized by ``lru_cache``.
+
+    Returns:
+        True if the base version of ``cudf.__version__`` is >= 24.10.0, else False.
+    """
+    current_cudf_version = parse_version(cudf.__version__)
+    # Re-parse the base version so the comparison is numeric. Comparing the
+    # ``base_version`` *string* is lexicographic, which ranks "24.8.0" above
+    # "24.10.0" and misclassified pre-releases of older cudf versions.
+    base_version = parse_version(current_cudf_version.base_version)
+    return base_version >= parse_version("24.10.0")
+
+
+def _construct_series_from_list_column(index: Any, lc: cudf.core.column.ListColumn) -> cudf.Series:
+    """Build a ``cudf.Series`` from the list column ``lc`` and ``index``.
+
+    On cudf older than 24.10 the public ``cudf.Series`` constructor is used. Otherwise
+    ``cudf.Series._from_column`` is called with ``index`` passed through ``ensure_index``.
+    """
+    if _is_cudf_gte_24_10():
+        from cudf.core.index import ensure_index
+
+        return cudf.Series._from_column(column=lc, index=ensure_index(index))
+
+    return cudf.Series(data=lc, index=index)
+
+
+def _construct_list_column(
+    size: int,
+    dtype: ListDtype,
+    mask: Optional["Buffer"] = None,
+    offset: int = 0,
+    null_count: Optional[int] = None,
+    children: tuple["NumericalColumn", "ColumnBase"] = (),  # type: ignore[assignment]
+) -> cudf.core.column.ListColumn:
+    """Instantiate a ``ListColumn`` with the keyword set the installed cudf expects.
+
+    Every argument is forwarded by keyword, unchanged, to
+    ``cudf.core.column.ListColumn``. On cudf 24.10 and newer an additional
+    ``data=None`` keyword is supplied.
+    """
+    column_kwargs = {
+        "size": size,
+        "dtype": dtype,
+        "mask": mask,
+        "offset": offset,
+        "null_count": null_count,
+        "children": children,
+    }
+
+    if _is_cudf_gte_24_10():
+        # in 24.10 ListColumn added `data` kwarg see https://github.com/rapidsai/crossfit/issues/84
+        column_kwargs = {"data": None, **column_kwargs}
+
+    return cudf.core.column.ListColumn(**column_kwargs)
 
 
 def create_list_series_from_1d_or_2d_ar(ar, index):
@@ -32,16 +93,16 @@ def create_list_series_from_1d_or_2d_ar(ar, index):
     offset_col = as_column(cp.arange(start=0, stop=len(data) + 1, step=n_cols), dtype="int32")
     mask_col = cp.full(shape=n_rows, fill_value=cp.bool_(True))
     mask = cudf._lib.transform.bools_to_mask(as_column(mask_col))
-    lc = cudf.core.column.ListColumn(
+
+    lc = _construct_list_column(
         size=n_rows,
         dtype=cudf.ListDtype(data.dtype),
         mask=mask,
         offset=0,
         null_count=0,
         children=(offset_col, data),
     )
-
-    return cudf.Series(lc, index=index)
+    return _construct_series_from_list_column(lc=lc, index=index)
 
 
 def create_nested_list_series_from_3d_ar(ar, index):
@@ -63,17 +124,22 @@ def create_nested_list_series_from_3d_ar(ar, index):
     outer_list_offsets = as_column(outer_offsets)
 
     # Constructing the nested ListColumn
-    lc = cudf.core.column.ListColumn(
+    inner_lc = _construct_list_column(
+        size=inner_offsets.size - 1,
+        dtype=cudf.ListDtype(inner_list_data.dtype),
+        children=(inner_list_offsets, inner_list_data),
+        mask=None,
+        offset=0,
+        null_count=None,
+    )
+
+    lc = _construct_list_column(
         size=n_slices,
         dtype=cudf.ListDtype(inner_list_data.dtype),
-        children=(
-            outer_list_offsets,
-            cudf.core.column.ListColumn(
-                size=inner_offsets.size - 1,
-                dtype=cudf.ListDtype(inner_list_data.dtype),
-                children=(inner_list_offsets, inner_list_data),
-            ),
-        ),
+        children=(outer_list_offsets, inner_lc),
+        mask=None,
+        offset=0,
+        null_count=None,
     )
 
-    return cudf.Series(lc, index=index)
+    return _construct_series_from_list_column(lc=lc, index=index)
diff --git a/crossfit/metric/ranking/f1.py b/crossfit/metric/ranking/f1.py
@@ -32,6 +32,6 @@ def _score(self, y_true, y_pred_labels):
         invalid = np.isnan(product)
 
         scores[valid] = product[valid] / sm[valid]
-        scores[invalid] = np.NaN
+        scores[invalid] = np.nan
 
         return scores
diff --git a/crossfit/metric/ranking/hitrate.py b/crossfit/metric/ranking/hitrate.py
@@ -21,5 +21,5 @@ class HitRate(Recall):
     def _score(self, y_true, y_pred_labels):
         n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
         scores = self._recall(y_true, y_pred_labels)
-        scores[n_pos != 1] = np.NaN  # Not defined for no or multiple positives
+        scores[n_pos != 1] = np.nan  # Not defined for no or multiple positives
         return scores
diff --git a/crossfit/metric/ranking/ndcg.py b/crossfit/metric/ranking/ndcg.py
@@ -39,7 +39,7 @@ def _dcg(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
         ranks = convert_array(ranks, type(y_pred_labels.data))
 
         scores = np.sum(self._rel_scale(labels) / self._log_fct(ranks + 1), axis=-1)
-        scores[n_pos == 0] = np.NaN
+        scores[n_pos == 0] = np.nan
         return scores
 
     def _score(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
@@ -57,6 +57,6 @@ def _score(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
         if idcg.shape[0] == 1 and ndcg.shape[0] > 1:
             idcg = np.ones_like(ndcg) * idcg
 
-        ndcg[idcg == 0] = np.NaN
+        ndcg[idcg == 0] = np.nan
 
         return dcg / idcg
diff --git a/crossfit/metric/ranking/precision.py b/crossfit/metric/ranking/precision.py
@@ -36,7 +36,7 @@ def _precision(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
             n_items_in_y_pred = items.sum(axis=1).flatten()
 
             # not defined if there are no relevant labels
-            scores = np.NaN * np.zeros_like(n_relevant, dtype=float)
+            scores = np.nan * np.zeros_like(n_relevant, dtype=float)
             valid = (n_items_in_y_pred > 0) & (n_pos > 0)
 
             scores[valid] = n_relevant[valid].astype(float) / np.minimum(
@@ -45,7 +45,7 @@ def _precision(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
         else:
             scores = n_relevant.astype(float) / self._k
             # not defined if there are no relevant labels
-            scores[n_pos == 0] = np.NaN
+            scores[n_pos == 0] = np.nan
 
         return scores
 
@@ -68,6 +68,6 @@ def _score(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
         scores[n_pos > 0] = np.sum(precision * relevant, axis=-1)[n_pos > 0] / np.clip(
             n_pos[n_pos > 0], None, self._k
         )
-        scores[n_pos == 0] = np.NaN
+        scores[n_pos == 0] = np.nan
 
         return scores
diff --git a/crossfit/metric/ranking/rank.py b/crossfit/metric/ranking/rank.py
@@ -26,7 +26,7 @@ def _score(self, y_true: SparseBinaryLabels, y_pred_labels: MaskedArray):
 
         # It is 1/rank if document appears in top k, 0 otherwise
         scores = np.max(labels / ranks, axis=-1, initial=0.0)
-        scores[n_pos == 0] = np.NaN  # Not defined for no multiple positives
+        scores[n_pos == 0] = np.nan  # Not defined when there are no positives
 
         return scores
 
@@ -42,7 +42,7 @@ def _score(self, y_true: SparseBinaryLabels, y_pred_labels: MaskedArray):
 
         scores = np.sum(ranks * labels, axis=-1)
         scores[n_pos > 0] = scores[n_pos > 0] / n_pos[n_pos > 0]
-        scores[n_pos == 0] = np.NaN
+        scores[n_pos == 0] = np.nan
         return scores
 
 
@@ -58,6 +58,6 @@ def _score(self, y_true: SparseBinaryLabels, y_pred_labels: MaskedArray):
         ranks = ranks * labels
         ranks[ranks == 0] = np.inf
         scores = np.min(ranks, axis=-1)
-        scores[n_pos == 0] = np.NaN
+        scores[n_pos == 0] = np.nan
 
         return scores
diff --git a/crossfit/metric/ranking/recall.py b/crossfit/metric/ranking/recall.py
@@ -30,7 +30,7 @@ def _recall(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
             axis=-1,
         )
 
-        scores = np.NaN * np.zeros_like(n_relevant, dtype=float)
+        scores = np.nan * np.zeros_like(n_relevant, dtype=float)
         if self._truncated:
             denominator = np.clip(n_pos[n_pos > 0], None, self._k)
         else:

diff --git a/tests/backend/cudf_backend/test_series.py b/tests/backend/cudf_backend/test_series.py
@@ -37,7 +37,6 @@ def test_create_nested_list_series_from_3d_ar():
     tensor = torch.tensor(nested_list)
     index = [1, 2]
     series = create_nested_list_series_from_3d_ar(tensor, index)
-    print(series)
     assert isinstance(series, cudf.Series)
     expected = cudf.Series(nested_list, index=index)
     # convert to pandas because cudf.Series.equals doesn't work for list series