Skip to content

Commit

Permalink
[python-package] simplify processing of pandas data (#6066)
Browse files Browse the repository at this point in the history
  • Loading branch information
jameslamb authored Sep 6, 2023
1 parent 8203306 commit ee51120
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 60 deletions.
118 changes: 62 additions & 56 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,57 +668,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None:


def _data_from_pandas(
    data: pd_DataFrame,
    feature_name: _LGBM_FeatureNameConfiguration,
    categorical_feature: _LGBM_CategoricalFeatureConfiguration,
    pandas_categorical: Optional[List[List]]
) -> Tuple[np.ndarray, List[str], List[str], List[List]]:
    """Convert a pandas DataFrame into a numpy array plus feature metadata.

    Parameters
    ----------
    data : pandas DataFrame
        Input data. It is never modified; when categorical columns are present
        all column rewrites happen on a shallow copy.
    feature_name : list of str, or 'auto'
        If 'auto', the stringified DataFrame column names are used.
        Any other value is returned unchanged.
    categorical_feature : list of str or int, or 'auto'
        If 'auto', the unordered pandas categorical columns are used.
        Otherwise the user-provided value is converted to a list.
    pandas_categorical : list of list, or None
        Category values, one list per categorical column, recorded from the
        training data. ``None`` means ``data`` is the training data and the
        categories are read from it.

    Returns
    -------
    data : numpy array
        Numeric representation of the DataFrame. Categorical columns are
        replaced by their integer codes, with code -1 mapped to NaN.
    feature_name : list of str
        Resolved feature names.
    categorical_feature : list
        Resolved categorical features.
    pandas_categorical : list of list
        Category values of each categorical column.

    Raises
    ------
    ValueError
        If ``data`` is not 2-dimensional or has no rows, or if the number of
        categorical columns differs from ``len(pandas_categorical)``.
    """
    if len(data.shape) != 2 or data.shape[0] < 1:
        raise ValueError('Input data must be 2 dimensional and non empty.')

    # determine feature names
    if feature_name == 'auto':
        feature_name = [str(col) for col in data.columns]

    # determine categorical features
    cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)]
    cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
    if cat_cols:
        # Take the shallow copy BEFORE any column assignment: both the category
        # re-alignment and the code conversion below replace whole columns, and
        # doing that on the caller's frame would silently alter their data.
        data = data.copy(deep=False)
    if pandas_categorical is None:  # train dataset
        pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
    else:
        if len(cat_cols) != len(pandas_categorical):
            raise ValueError('train and valid dataset categorical_feature do not match.')
        for col, category in zip(cat_cols, pandas_categorical):
            # re-align categories with the recorded ones so the codes are comparable
            if list(data[col].cat.categories) != list(category):
                data[col] = data[col].cat.set_categories(category)
    if cat_cols:
        # replace categories by their integer codes; code -1 (no category) becomes NaN
        data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
    if categorical_feature == 'auto':  # use cat cols from DataFrame
        categorical_feature = cat_cols_not_ordered
    else:  # use cat cols specified by user
        categorical_feature = list(categorical_feature)  # type: ignore[assignment]

    # get numpy representation of the data
    # NOTE(review): presumably raises on unsupported column dtypes - confirm against the helper
    _check_for_bad_pandas_dtypes(data.dtypes)
    df_dtypes = [dtype.type for dtype in data.dtypes]
    df_dtypes.append(np.float32)  # so that the target dtype considers floats
    target_dtype = np.result_type(*df_dtypes)
    try:
        # most common case (no nullable dtypes)
        data = data.to_numpy(dtype=target_dtype, copy=False)
    except TypeError:
        # 1.0 <= pd version < 1.1 and nullable dtypes, least common case
        # raises error because array is casted to type(pd.NA) and there's no na_value argument
        data = data.astype(target_dtype, copy=False).values
    except ValueError:
        # data has nullable dtypes, but we can specify na_value argument and copy will be made
        data = data.to_numpy(dtype=target_dtype, na_value=np.nan)
    return data, feature_name, categorical_feature, pandas_categorical


Expand Down Expand Up @@ -1004,7 +999,15 @@ def predict(
ctypes.c_int(len(data_names)),
)
)
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]

if isinstance(data, pd_DataFrame):
data = _data_from_pandas(
data=data,
feature_name="auto",
categorical_feature="auto",
pandas_categorical=self.pandas_categorical
)[0]

predict_type = _C_API_PREDICT_NORMAL
if raw_score:
predict_type = _C_API_PREDICT_RAW_SCORE
Expand Down Expand Up @@ -1854,10 +1857,13 @@ def _lazy_init(
if reference is not None:
self.pandas_categorical = reference.pandas_categorical
categorical_feature = reference.categorical_feature
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data,
feature_name=feature_name,
categorical_feature=categorical_feature,
pandas_categorical=self.pandas_categorical)
if isinstance(data, pd_DataFrame):
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(
data=data,
feature_name=feature_name,
categorical_feature=categorical_feature,
pandas_categorical=self.pandas_categorical
)

# process for args
params = {} if params is None else params
Expand All @@ -1867,10 +1873,10 @@ def _lazy_init(
_log_warning(f'{key} keyword has been found in `params` and will be ignored.\n'
f'Please use {key} argument of the Dataset constructor to pass this parameter.')
# get categorical features
if categorical_feature is not None:
if isinstance(categorical_feature, list):
categorical_indices = set()
feature_dict = {}
if feature_name is not None:
if isinstance(feature_name, list):
feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature:
if isinstance(name, str) and name in feature_dict:
Expand Down
4 changes: 2 additions & 2 deletions python-package/lightgbm/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,8 +712,8 @@ def create_tree_digraph(
if isinstance(example_case, pd_DataFrame):
example_case = _data_from_pandas(
data=example_case,
feature_name=None,
categorical_feature=None,
feature_name="auto",
categorical_feature="auto",
pandas_categorical=booster.pandas_categorical
)[0]
example_case = example_case[0]
Expand Down
21 changes: 19 additions & 2 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd = pytest.importorskip('pandas')
X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X)
built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
built_data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0]
assert built_data.dtype == dtype
assert np.shares_memory(X, built_data)

Expand All @@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name):
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
column_name = 'a' if feature_name == 'auto' else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0]
# check that the original data wasn't modified
np.testing.assert_equal(df[column_name], X[:, 0])
# check that the built data has the codes
Expand Down Expand Up @@ -806,3 +816,10 @@ def test_set_leaf_output():
leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id)
bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1)
np.testing.assert_allclose(bst.predict(X), y_pred + 1)


def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset():
    """A Dataset built from a bare numpy array reports ``Column_<i>`` feature names."""
    raw_features = np.random.randn(100, 3)
    constructed = lgb.Dataset(data=raw_features).construct()
    expected_names = ["Column_0", "Column_1", "Column_2"]
    assert constructed.feature_name == expected_names

0 comments on commit ee51120

Please sign in to comment.