Skip to content

Commit

Permalink
ListColumn constructor for cudf 24.10 + np.nan vs np.NaN (#86)
Browse files Browse the repository at this point in the history
* first commit

Signed-off-by: Praateek Mahajan <[email protected]>

* remove dead import

Signed-off-by: Praateek Mahajan <[email protected]>

* add return type hint

Signed-off-by: Praateek Mahajan <[email protected]>

* fixes

Signed-off-by: Praateek Mahajan <[email protected]>

* pre-commit

Signed-off-by: Praateek Mahajan <[email protected]>

* np.NaN -> np.nan

Signed-off-by: Praateek Mahajan <[email protected]>

* fix version checking

Signed-off-by: Praateek Mahajan <[email protected]>

* Update github workflow for GPU Tests (#87)

* refer to 24.08 gh workflows

Signed-off-by: Praateek Mahajan <[email protected]>

* update docker img

Signed-off-by: Praateek Mahajan <[email protected]>

* skip pytest

Signed-off-by: Praateek Mahajan <[email protected]>

---------

Signed-off-by: Praateek Mahajan <[email protected]>

* pre-commit

Signed-off-by: Praateek Mahajan <[email protected]>

* change how version is understood for cudf

Signed-off-by: Praateek Mahajan <[email protected]>

* move import out

Signed-off-by: Praateek Mahajan <[email protected]>

---------

Signed-off-by: Praateek Mahajan <[email protected]>
  • Loading branch information
praateekmahajan authored Sep 24, 2024
1 parent bdcf60f commit 2a0272a
Show file tree
Hide file tree
Showing 8 changed files with 90 additions and 25 deletions.
92 changes: 79 additions & 13 deletions crossfit/backend/cudf/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,70 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import lru_cache
from typing import TYPE_CHECKING, Any, Optional

import cudf
import cupy as cp
from cudf.core.column import as_column
from cudf.core.dtypes import ListDtype
from packaging.version import parse as parse_version

if TYPE_CHECKING:
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
from cudf.core.column.numerical import NumericalColumn


@lru_cache
def _is_cudf_gte_24_10() -> bool:
    """Return True when the installed cudf is from the 24.10 release line or newer.

    Pre-release and nightly builds (for example ``24.10.00a123``) sort
    *before* ``24.10.0`` under PEP 440 ordering, so the check is made on the
    base version (the version with pre/post/dev/local suffixes removed)
    rather than on the full version. This way a 24.10 nightly is treated as
    24.10.

    The result is cached so ``cudf.__version__`` is parsed only once.
    """
    current_cudf_version = parse_version(cudf.__version__)
    # Re-parse the base version so that each release segment is compared
    # numerically. Comparing the ``base_version`` *strings* is lexicographic
    # and would rank "24.8.0" above "24.10.0", misreporting 24.08
    # pre-releases as >= 24.10.
    current_base_version = parse_version(current_cudf_version.base_version)
    return current_base_version >= parse_version("24.10.0")


def _construct_series_from_list_column(index: Any, lc: cudf.core.column.ListColumn) -> cudf.Series:
    """Wrap the list column ``lc`` in a ``cudf.Series`` labelled by ``index``.

    On cudf >= 24.10 the series is built with ``cudf.Series._from_column``
    after passing ``index`` through ``ensure_index``; on older versions the
    public ``cudf.Series(data=..., index=...)`` constructor is used.
    """
    if _is_cudf_gte_24_10():
        # Imported lazily: only the >= 24.10 code path needs this helper.
        from cudf.core.index import ensure_index

        return cudf.Series._from_column(column=lc, index=ensure_index(index))

    return cudf.Series(data=lc, index=index)


def _construct_list_column(
    size: int,
    dtype: ListDtype,
    mask: Optional["Buffer"] = None,
    offset: int = 0,
    null_count: Optional[int] = None,
    children: tuple["NumericalColumn", "ColumnBase"] = (),  # type: ignore[assignment]
) -> cudf.core.column.ListColumn:
    """Build a ``cudf.core.column.ListColumn`` across cudf versions.

    All arguments are forwarded to the ``ListColumn`` constructor as keyword
    arguments. On cudf >= 24.10 an additional ``data=None`` keyword is
    supplied as well.
    """
    column_kwargs = {
        "size": size,
        "dtype": dtype,
        "mask": mask,
        "offset": offset,
        "null_count": null_count,
        "children": children,
    }

    if _is_cudf_gte_24_10():
        # in 24.10 ListColumn added `data` kwarg see https://github.com/rapidsai/crossfit/issues/84
        column_kwargs["data"] = None

    return cudf.core.column.ListColumn(**column_kwargs)


def create_list_series_from_1d_or_2d_ar(ar, index):
Expand All @@ -32,16 +93,16 @@ def create_list_series_from_1d_or_2d_ar(ar, index):
offset_col = as_column(cp.arange(start=0, stop=len(data) + 1, step=n_cols), dtype="int32")
mask_col = cp.full(shape=n_rows, fill_value=cp.bool_(True))
mask = cudf._lib.transform.bools_to_mask(as_column(mask_col))
lc = cudf.core.column.ListColumn(

lc = _construct_list_column(
size=n_rows,
dtype=cudf.ListDtype(data.dtype),
mask=mask,
offset=0,
null_count=0,
children=(offset_col, data),
)

return cudf.Series(lc, index=index)
return _construct_series_from_list_column(lc=lc, index=index)


def create_nested_list_series_from_3d_ar(ar, index):
Expand All @@ -63,17 +124,22 @@ def create_nested_list_series_from_3d_ar(ar, index):
outer_list_offsets = as_column(outer_offsets)

# Constructing the nested ListColumn
lc = cudf.core.column.ListColumn(
inner_lc = _construct_list_column(
size=inner_offsets.size - 1,
dtype=cudf.ListDtype(inner_list_data.dtype),
children=(inner_list_offsets, inner_list_data),
mask=None,
offset=0,
null_count=None,
)

lc = _construct_list_column(
size=n_slices,
dtype=cudf.ListDtype(inner_list_data.dtype),
children=(
outer_list_offsets,
cudf.core.column.ListColumn(
size=inner_offsets.size - 1,
dtype=cudf.ListDtype(inner_list_data.dtype),
children=(inner_list_offsets, inner_list_data),
),
),
children=(outer_list_offsets, inner_lc),
mask=None,
offset=0,
null_count=None,
)

return cudf.Series(lc, index=index)
return _construct_series_from_list_column(lc=lc, index=index)
2 changes: 1 addition & 1 deletion crossfit/metric/ranking/f1.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ def _score(self, y_true, y_pred_labels):
invalid = np.isnan(product)

scores[valid] = product[valid] / sm[valid]
scores[invalid] = np.NaN
scores[invalid] = np.nan

return scores
2 changes: 1 addition & 1 deletion crossfit/metric/ranking/hitrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,5 @@ class HitRate(Recall):
def _score(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
scores = self._recall(y_true, y_pred_labels)
scores[n_pos != 1] = np.NaN # Not defined for no or multiple positives
scores[n_pos != 1] = np.nan # Not defined for no or multiple positives
return scores
4 changes: 2 additions & 2 deletions crossfit/metric/ranking/ndcg.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def _dcg(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
ranks = convert_array(ranks, type(y_pred_labels.data))

scores = np.sum(self._rel_scale(labels) / self._log_fct(ranks + 1), axis=-1)
scores[n_pos == 0] = np.NaN
scores[n_pos == 0] = np.nan
return scores

def _score(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
Expand All @@ -57,6 +57,6 @@ def _score(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
if idcg.shape[0] == 1 and ndcg.shape[0] > 1:
idcg = np.ones_like(ndcg) * idcg

ndcg[idcg == 0] = np.NaN
ndcg[idcg == 0] = np.nan

return dcg / idcg
6 changes: 3 additions & 3 deletions crossfit/metric/ranking/precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _precision(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
n_items_in_y_pred = items.sum(axis=1).flatten()

# not defined if there are no relevant labels
scores = np.NaN * np.zeros_like(n_relevant, dtype=float)
scores = np.nan * np.zeros_like(n_relevant, dtype=float)
valid = (n_items_in_y_pred > 0) & (n_pos > 0)

scores[valid] = n_relevant[valid].astype(float) / np.minimum(
Expand All @@ -45,7 +45,7 @@ def _precision(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
else:
scores = n_relevant.astype(float) / self._k
# not defined if there are no relevant labels
scores[n_pos == 0] = np.NaN
scores[n_pos == 0] = np.nan

return scores

Expand All @@ -68,6 +68,6 @@ def _score(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
scores[n_pos > 0] = np.sum(precision * relevant, axis=-1)[n_pos > 0] / np.clip(
n_pos[n_pos > 0], None, self._k
)
scores[n_pos == 0] = np.NaN
scores[n_pos == 0] = np.nan

return scores
6 changes: 3 additions & 3 deletions crossfit/metric/ranking/rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def _score(self, y_true: SparseBinaryLabels, y_pred_labels: MaskedArray):

# It is 1/rank if document appears in top k, 0 otherwise
scores = np.max(labels / ranks, axis=-1, initial=0.0)
scores[n_pos == 0] = np.NaN # Not defined for no multiple positives
scores[n_pos == 0] = np.nan # Not defined for no multiple positives

return scores

Expand All @@ -42,7 +42,7 @@ def _score(self, y_true: SparseBinaryLabels, y_pred_labels: MaskedArray):

scores = np.sum(ranks * labels, axis=-1)
scores[n_pos > 0] = scores[n_pos > 0] / n_pos[n_pos > 0]
scores[n_pos == 0] = np.NaN
scores[n_pos == 0] = np.nan
return scores


Expand All @@ -58,6 +58,6 @@ def _score(self, y_true: SparseBinaryLabels, y_pred_labels: MaskedArray):
ranks = ranks * labels
ranks[ranks == 0] = np.inf
scores = np.min(ranks, axis=-1)
scores[n_pos == 0] = np.NaN
scores[n_pos == 0] = np.nan

return scores
2 changes: 1 addition & 1 deletion crossfit/metric/ranking/recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def _recall(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
axis=-1,
)

scores = np.NaN * np.zeros_like(n_relevant, dtype=float)
scores = np.nan * np.zeros_like(n_relevant, dtype=float)
if self._truncated:
denominator = np.clip(n_pos[n_pos > 0], None, self._k)
else:
Expand Down
1 change: 0 additions & 1 deletion tests/backend/cudf_backend/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def test_create_nested_list_series_from_3d_ar():
tensor = torch.tensor(nested_list)
index = [1, 2]
series = create_nested_list_series_from_3d_ar(tensor, index)
print(series)
assert isinstance(series, cudf.Series)
expected = cudf.Series(nested_list, index=index)
# convert to pandas because cudf.Series.equals doesn't work for list series
Expand Down

0 comments on commit 2a0272a

Please sign in to comment.