From ee51120118b1e4a04c13df32da923d5805f4f9f9 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 6 Sep 2023 08:14:20 -0500 Subject: [PATCH] [python-package] simplify processing of pandas data (#6066) --- python-package/lightgbm/basic.py | 118 +++++++++++++----------- python-package/lightgbm/plotting.py | 4 +- tests/python_package_test/test_basic.py | 21 ++++- 3 files changed, 83 insertions(+), 60 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 2f061bdacf31..182ec200d207 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -668,57 +668,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: def _data_from_pandas( - data, - feature_name: Optional[_LGBM_FeatureNameConfiguration], - categorical_feature: Optional[_LGBM_CategoricalFeatureConfiguration], + data: pd_DataFrame, + feature_name: _LGBM_FeatureNameConfiguration, + categorical_feature: _LGBM_CategoricalFeatureConfiguration, pandas_categorical: Optional[List[List]] -): - if isinstance(data, pd_DataFrame): - if len(data.shape) != 2 or data.shape[0] < 1: - raise ValueError('Input data must be 2 dimensional and non empty.') - if feature_name == 'auto' or feature_name is None: - data = data.rename(columns=str, copy=False) - cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] - cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] - if pandas_categorical is None: # train dataset - pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] - else: - if len(cat_cols) != len(pandas_categorical): - raise ValueError('train and valid dataset categorical_feature do not match.') - for col, category in zip(cat_cols, pandas_categorical): - if list(data[col].cat.categories) != list(category): - data[col] = data[col].cat.set_categories(category) - if len(cat_cols): # cat_cols is list - data = data.copy(deep=False) # not alter origin DataFrame - data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) - if categorical_feature is not None: - if feature_name is None: - feature_name = list(data.columns) - if categorical_feature == 'auto': # use cat cols from DataFrame - categorical_feature = cat_cols_not_ordered - else: # use cat cols specified by user - categorical_feature = list(categorical_feature) # type: ignore[assignment] - if feature_name == 'auto': - feature_name = list(data.columns) - _check_for_bad_pandas_dtypes(data.dtypes) - df_dtypes = [dtype.type for dtype in data.dtypes] - df_dtypes.append(np.float32) # so that the target dtype considers floats - target_dtype = np.result_type(*df_dtypes) - try: - # most common case (no nullable dtypes) - data = data.to_numpy(dtype=target_dtype, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - data = data.astype(target_dtype, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - data = data.to_numpy(dtype=target_dtype, na_value=np.nan) +) -> Tuple[np.ndarray, List[str], List[str], List[List]]: + if len(data.shape) != 2 or data.shape[0] < 1: + raise ValueError('Input data must be 2 dimensional and non empty.') + + # determine feature names + if feature_name == 'auto': + feature_name = [str(col) for col in data.columns] + + # determine categorical features + cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] + cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] + if pandas_categorical is None: # train dataset + pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] else: - if feature_name == 'auto': - feature_name = None - if categorical_feature == 'auto': - categorical_feature = None + if len(cat_cols) != len(pandas_categorical): + raise ValueError('train and valid dataset categorical_feature do not match.') + for col, category in zip(cat_cols, pandas_categorical): + if list(data[col].cat.categories) != list(category): + data[col] = data[col].cat.set_categories(category) + if len(cat_cols): # cat_cols is list + data = data.copy(deep=False) # not alter origin DataFrame + data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) + if categorical_feature == 'auto': # use cat cols from DataFrame + categorical_feature = cat_cols_not_ordered + else: # use cat cols specified by user + categorical_feature = list(categorical_feature) # type: ignore[assignment] + + # get numpy representation of the data + _check_for_bad_pandas_dtypes(data.dtypes) + df_dtypes = [dtype.type for dtype in data.dtypes] + df_dtypes.append(np.float32) # so that the target dtype considers floats + target_dtype = np.result_type(*df_dtypes) + try: + # most common case (no nullable dtypes) + data = data.to_numpy(dtype=target_dtype, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + data = data.astype(target_dtype, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + data = data.to_numpy(dtype=target_dtype, na_value=np.nan) return data, feature_name, categorical_feature, pandas_categorical @@ -1004,7 +999,15 @@ def predict( ctypes.c_int(len(data_names)), ) ) - data = _data_from_pandas(data, None, None, self.pandas_categorical)[0] + + if isinstance(data, pd_DataFrame): + data = _data_from_pandas( + data=data, + feature_name="auto", + categorical_feature="auto", + pandas_categorical=self.pandas_categorical + )[0] + predict_type = _C_API_PREDICT_NORMAL if raw_score: predict_type = _C_API_PREDICT_RAW_SCORE @@ -1854,10 +1857,13 @@ def _lazy_init( if reference is not None: self.pandas_categorical = reference.pandas_categorical categorical_feature = reference.categorical_feature - data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data, - feature_name=feature_name, - categorical_feature=categorical_feature, - pandas_categorical=self.pandas_categorical) + if isinstance(data, pd_DataFrame): + data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas( + data=data, + feature_name=feature_name, + categorical_feature=categorical_feature, + pandas_categorical=self.pandas_categorical + ) # process for args params = {} if params is None else params @@ -1867,10 +1873,10 @@ def _lazy_init( _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' f'Please use {key} argument of the Dataset constructor to pass this parameter.') # get categorical features - if categorical_feature is not None: + if isinstance(categorical_feature, list): categorical_indices = set() feature_dict = {} - if feature_name is not None: + if isinstance(feature_name, list): feature_dict = {name: i for i, name in enumerate(feature_name)} for name in categorical_feature: if isinstance(name, str) and name in feature_dict: diff --git a/python-package/lightgbm/plotting.py b/python-package/lightgbm/plotting.py index f16a4f274313..85b245c187ef 100644 --- a/python-package/lightgbm/plotting.py +++ b/python-package/lightgbm/plotting.py @@ -712,8 +712,8 @@ def create_tree_digraph( if isinstance(example_case, pd_DataFrame): example_case = _data_from_pandas( data=example_case, - feature_name=None, - categorical_feature=None, + feature_name="auto", + categorical_feature="auto", pandas_categorical=booster.pandas_categorical )[0] example_case = example_case[0] diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 267041eae2e4..7f8980c271f7 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): pd = pytest.importorskip('pandas') X = np.random.rand(10, 2).astype(dtype) df = pd.DataFrame(X) - built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + built_data = lgb.basic._data_from_pandas( + data=df, + feature_name=feature_name, + categorical_feature="auto", + pandas_categorical=None + )[0] assert built_data.dtype == dtype assert np.shares_memory(X, built_data) @@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) column_name = 'a' if feature_name == 'auto' else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') - data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + data = lgb.basic._data_from_pandas( + data=df, + feature_name=feature_name, + categorical_feature="auto", + pandas_categorical=None + )[0] # check that the original data wasn't modified np.testing.assert_equal(df[column_name], X[:, 0]) # check that the built data has the codes @@ -806,3 +816,10 @@ def test_set_leaf_output(): leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id) bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1) np.testing.assert_allclose(bst.predict(X), y_pred + 1) + + +def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset(): + ds = lgb.Dataset( + data=np.random.randn(100, 3), + ) + assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]