Un-pandas GroupedTimeSeriesSplit #605

Draft · wants to merge 2 commits into base: main
Changes from 1 commit
129 changes: 47 additions & 82 deletions sklego/model_selection.py
@@ -131,7 +131,7 @@ def split(self, X, y=None, groups=None):

         if len(X) != len(X_index_df):
             raise AssertionError(
-                "X and X_index_df are not the same length, " "there must be some index missing in 'self.date_serie'"
+                "X and X_index_df are not the same length, there must be some index missing in 'self.date_serie'"
             )

         date_min = X_index_df["__date__"].min()
@@ -148,11 +148,8 @@ def split(self, X, y=None, groups=None):
         if self.n_splits:
             if n_split_max < self.n_splits:
                 raise ValueError(
-                    (
-                        "Number of folds requested = {1} are greater"
-                        " than maximum ={0} possible without"
-                        " overlapping validation sets."
-                    ).format(n_split_max, self.n_splits)
+                    f"Number of folds requested = {self.n_splits} is greater than the maximum (={n_split_max}) possible"
+                    " without overlapping validation sets."
                 )

         current_date = date_min
@@ -345,16 +342,14 @@ class GroupTimeSeriesSplit(_BaseKFold):
     def __init__(self, n_splits):
         if not isinstance(n_splits, numbers.Integral):
             raise ValueError(
-                "The number of folds must be of Integral type. "
-                "%s of type %s was passed." % (n_splits, type(n_splits))
+                f"The number of folds must be of Integral type. {n_splits} of type {type(n_splits)} was passed."
             )
         n_splits = int(n_splits)

         if n_splits <= 1:
             raise ValueError(
-                "k-fold cross-validation requires at least one"
-                " train/test split by setting n_splits=2 or more,"
-                " got n_splits={0}.".format(n_splits)
+                "k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more,"
+                f" got n_splits={n_splits}."
             )

         self.n_splits = n_splits
@@ -371,11 +366,14 @@ def summary(self):
         """
         try:
             return (
-                self._grouped_df.sort_index()
-                .assign(group=lambda df: df["group"].astype(int))
-                .assign(obs_per_group=lambda df: df.groupby("group")["observations"].transform("sum"))
-                .assign(ideal_group_size=round(self._ideal_group_size))
-                .assign(diff_from_ideal_group_size=lambda df: df["obs_per_group"] - df["ideal_group_size"])
+                pd.DataFrame({"group": self._group, "observations": self._obs_per_grp, "index": self._index})
+                .sort_values("index")
+                .assign(
+                    group=lambda df: df["group"].astype(int),
+                    obs_per_group=lambda df: df.groupby("group")["observations"].transform("sum"),
+                    ideal_group_size=round(self._ideal_group_size),
+                    diff_from_ideal_group_size=lambda df: df["obs_per_group"] - df["ideal_group_size"],
+                )
             )
         except AttributeError:
             raise AttributeError(".summary() only works after having run .split(X, y, groups).")
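As a sanity check on this hunk, the same frame can be assembled from plain arrays outside the class. A minimal sketch with hypothetical toy values (`_index`, `_obs_per_grp`, and `_group` mimic the attributes that `.split()` sets; none of these numbers come from the PR):

```python
import numpy as np
import pandas as pd

# Hypothetical state after .split(): three unique groups assigned to two folds.
_index = np.array([2018, 2019, 2020])       # unique group labels, sorted
_obs_per_grp = np.array([10, 20, 270])      # observations per unique group
_group = np.array([0, 0, 1])                # fold id assigned to each group
_ideal_group_size = _obs_per_grp.sum() / 2  # 150.0 for n_splits + 1 = 2 folds

summary = (
    pd.DataFrame({"group": _group, "observations": _obs_per_grp, "index": _index})
    .sort_values("index")
    .assign(
        group=lambda df: df["group"].astype(int),
        obs_per_group=lambda df: df.groupby("group")["observations"].transform("sum"),
        ideal_group_size=round(_ideal_group_size),
        diff_from_ideal_group_size=lambda df: df["obs_per_group"] - df["ideal_group_size"],
    )
)
print(summary)  # obs_per_group: 30, 30, 270; diff_from_ideal_group_size: -120, -120, 120
```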
@@ -402,9 +400,8 @@ def split(self, X=None, y=None, groups=None):
         X, y, groups = indexable(X, y, groups)
         n_groups = np.unique(groups).shape[0]
         if self.n_splits >= n_groups:
-            raise ValueError(
-                ("n_splits({0}) must be less than the amount" " of unique groups({1}).").format(self.n_splits, n_groups)
-            )
+            raise ValueError(f"n_splits({self.n_splits}) must be less than the number of unique groups ({n_groups}).")

         return list(self._iter_test_indices(X, y, groups))

     def get_n_splits(self, X=None, y=None, groups=None):
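One subtlety the rewritten `raise` above had to get right: when a message is split across adjacent string literals, every fragment containing placeholders needs its own `f` prefix, or the braces are emitted verbatim. A quick demonstration:

```python
n_splits, n_groups = 5, 3

# Only the first fragment is an f-string, so {n_groups} is NOT interpolated:
broken = f"n_splits({n_splits}) must be less than the amount" " of unique groups({n_groups})."
print(broken)  # n_splits(5) must be less than the amount of unique groups({n_groups}).

# A single f-string avoids the trap entirely:
fixed = f"n_splits({n_splits}) must be less than the number of unique groups ({n_groups})."
print(fixed)   # n_splits(5) must be less than the number of unique groups (3).
```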
@@ -442,27 +439,17 @@ def _check_for_long_estimated_runtime(self, groups):
         """
         unique_groups = len(set(groups))
         warning = (
-            "Finding the optimal split points"
-            " with {0} unique groups and n_splits at {1}"
+            f"Finding the optimal split points with {unique_groups} unique groups and n_splits at {self.n_splits}"
             " can take several minutes."
-        ).format(unique_groups, self.n_splits)
+        )
         if self.n_splits == 4 and unique_groups > 250:
-            warn(
-                warning + " Consider to decrease n_splits to 3 or lower.",
-                UserWarning,
-            )
+            warn(warning + " Consider decreasing n_splits to 3 or lower.", UserWarning)

         elif self.n_splits == 5 and unique_groups > 100:
-            warn(
-                warning + " Consider to decrease n_splits to 4 or lower.",
-                UserWarning,
-            )
+            warn(warning + " Consider decreasing n_splits to 4 or lower.", UserWarning)

         elif self.n_splits > 5 and unique_groups > 30:
-            warn(
-                warning + " Consider to decrease n_splits to 5 or lower.",
-                UserWarning,
-            )
+            warn(warning + " Consider decreasing n_splits to 5 or lower.", UserWarning)

     def _iter_test_indices(self, X=None, y=None, groups=None):
         """Calculate the optimal division of groups into folds so that the folds are as equal in size as possible.
@@ -485,8 +472,10 @@ def _iter_test_indices(self, X=None, y=None, groups=None):
         self._first_split_index, self._last_split_index = self._calc_first_and_last_split_index(groups=groups)
         self._best_splits = self._get_split_indices()
         groups = self._regroup(groups)
+        group_indices = tuple(np.where(groups == i)[0] for i in range(self.n_splits + 1))

         for i in range(self.n_splits):
-            yield np.where(groups == i)[0], np.where(groups == i + 1)[0]
+            yield group_indices[i], group_indices[i + 1]
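The added `group_indices` line trades 2 * n_splits `np.where` calls for a single pass over the folds. A self-contained sketch of the same pattern with made-up fold ids:

```python
import numpy as np

n_splits = 2
# After _regroup, `groups` holds a fold id in 0..n_splits for every row.
groups = np.array([0, 0, 1, 1, 1, 2, 2])

# Row indices of each fold, computed once up front.
group_indices = tuple(np.where(groups == i)[0] for i in range(n_splits + 1))

for i in range(n_splits):
    train, test = group_indices[i], group_indices[i + 1]
    print(train, test)
# [0 1] [2 3 4]
# [2 3 4] [5 6]
```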

     def _calc_first_and_last_split_index(self, X=None, y=None, groups=None):

Review comment (collaborator, author): "In a way, the changes in _calc_first_and_last_split_index are the only hard-core difference."
"""Calculate an approximate first and last split point to reduce the amount of options during a brute force
Expand All @@ -508,42 +497,17 @@ def _calc_first_and_last_split_index(self, X=None, y=None, groups=None):
"""

# get the counts (=amount of rows) for each group
self._grouped_df = (
pd.DataFrame(np.array(groups))
.rename(columns={0: "index"})
.groupby("index")
.size()
.sort_index()
.to_frame()
.rename(columns={0: "observations"})
)
idx, obs_per_grp = np.unique(groups, return_counts=True)
order = np.argsort(idx)
self._index, self._obs_per_grp = idx[order], obs_per_grp[order]

# set the ideal group_size and reduce it to 90% to have some leverage
self._ideal_group_size = np.sum(self._grouped_df["observations"]) / (self.n_splits + 1)
self._ideal_group_size = np.sum(self._obs_per_grp) / (self.n_splits + 1)
init_ideal_group_size = self._ideal_group_size * 0.9

# initialize the index of the first split, to reduce the amount of possible index split options
first_split_index = (
self._grouped_df.assign(cumsum_obs=lambda df: df["observations"].cumsum())
.assign(group_id=lambda df: (df["cumsum_obs"] - 1) // init_ideal_group_size)
.reset_index()
.loc[lambda df: df["group_id"] != 0]
.iloc[0]
.name
)
# initialize the index of the last split point, to reduce the amount of possible index split options
last_split_index = len(self._grouped_df) - (
self._grouped_df.assign(
observations=lambda df: df["observations"].values[::-1],
cumsum_obs=lambda df: df["observations"].cumsum(),
)
.reset_index()
.assign(group_id=lambda df: (df["cumsum_obs"] - 1) // init_ideal_group_size)
.loc[lambda df: df["group_id"] != 0]
.iloc[0]
.name
- 1
)
cumsum_obs = np.cumsum(self._obs_per_grp)
first_split_index = np.where(cumsum_obs > init_ideal_group_size)[0][0]
last_split_index = len(self._obs_per_grp) - np.where(cumsum_obs[::-1] > init_ideal_group_size)[0][0] - 1

return first_split_index, last_split_index
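A worked toy example for the block above (the numbers are illustrative, not from the PR). Note that the second cumulative sum must run over the *reversed counts*: reversing the forward cumulative sum instead would compare every position against the grand total and always land on the last group, defeating the narrowing.

```python
import numpy as np

obs_per_grp = np.array([2, 3, 4, 1])  # observations per unique group, in group order
n_splits = 1

ideal_group_size = obs_per_grp.sum() / (n_splits + 1)  # 10 / 2 = 5.0
init_ideal_group_size = ideal_group_size * 0.9         # 4.5

cumsum_obs = np.cumsum(obs_per_grp)                                      # [2, 5, 9, 10]
first_split_index = np.where(cumsum_obs > init_ideal_group_size)[0][0]  # 1

cumsum_reversed = np.cumsum(obs_per_grp[::-1])                           # [1, 5, 8, 10]
last_split_index = len(obs_per_grp) - np.where(cumsum_reversed > init_ideal_group_size)[0][0] - 1  # 2
```

The brute-force search that follows then only has to consider split points in `range(1, 2)` rather than the full group range.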

     def _get_split_indices(self):
@@ -558,7 +522,7 @@ def _get_split_indices(self):
         # set the index range to search possible splits for
         index_range = range(self._first_split_index, self._last_split_index)

-        observations = self._grouped_df["observations"].tolist()
+        observations = self._obs_per_grp

         # create generator with all the possible index splits
         # e.g. for [0, 1, 3, 5, 8] and self.n_splits = 2
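The lines that actually build the generator are collapsed above; assuming they enumerate combinations of split positions within the narrowed range, with a shifted copy of the generator for pairwise comparison (an assumption, since the construction is hidden), the idea looks like:

```python
from itertools import combinations, tee

index_range = range(1, 4)  # e.g. _first_split_index = 1, _last_split_index = 4
n_splits = 2

# Every way to place n_splits fold boundaries inside the allowed range,
# plus a copy shifted one candidate ahead for consecutive comparison.
splits_generator, splits_generator_shifted = tee(combinations(index_range, n_splits))
next(splits_generator_shifted, None)

print(list(combinations(index_range, n_splits)))  # [(1, 2), (1, 3), (2, 3)]
```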
@@ -579,22 +543,22 @@
         # ideal_group_size = 100
         # group_sizes = [10,20,270]
         # diff_from_ideal_list = [-90, -80, 170]
-        diff_from_ideal_list = [sum(observations[: first_splits[0]]) - self._ideal_group_size]
+        diff_from_ideal_list = [np.sum(observations[: first_splits[0]]) - self._ideal_group_size]
         for split in sliding_window(first_splits, window_size=2, step_size=1):
             try:
-                diff_from_ideal_list += [sum(observations[split[0] : split[1]]) - self._ideal_group_size]
+                diff_from_ideal_list += [np.sum(observations[split[0] : split[1]]) - self._ideal_group_size]
             except IndexError:
-                diff_from_ideal_list += [sum(observations[split[0] :]) - self._ideal_group_size]
+                diff_from_ideal_list += [np.sum(observations[split[0] :]) - self._ideal_group_size]

         # keep track of the minimum of the total difference from all groups to the ideal group size
-        min_diff = sum([abs(diff) for diff in diff_from_ideal_list])
+        min_diff = np.sum([np.abs(diff) for diff in diff_from_ideal_list])
         best_splits = first_splits

         # loop through all possible split points and check whether a new split
         # has a smaller total difference from all groups to the ideal group size
         for prev_splits, new_splits in zip(splits_generator, splits_generator_shifted):
             diff_from_ideal_list = self._calc_new_diffs(observations, diff_from_ideal_list, prev_splits, new_splits)
-            new_diff = sum([abs(diff) for diff in diff_from_ideal_list])
+            new_diff = np.sum([np.abs(diff) for diff in diff_from_ideal_list])

             # if the new split's total difference is smaller than the current optimum, save the new split
             if new_diff < min_diff:
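The commented example above is easy to reproduce with plain numpy; a minimal sketch whose numbers mirror the comment:

```python
import numpy as np

observations = np.array([10, 20, 270])  # rows per unique group
ideal_group_size = 100.0
first_splits = (1, 2)                   # folds: observations[:1], [1:2], [2:]

diff_from_ideal_list = [np.sum(observations[:1]) - ideal_group_size]    # 10 - 100 = -90
diff_from_ideal_list += [np.sum(observations[1:2]) - ideal_group_size]  # 20 - 100 = -80
diff_from_ideal_list += [np.sum(observations[2:]) - ideal_group_size]   # 270 - 100 = 170

total_diff = np.sum([np.abs(d) for d in diff_from_ideal_list])  # 90 + 80 + 170 = 340
print(diff_from_ideal_list, total_diff)  # [-90.0, -80.0, 170.0] 340.0
```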
@@ -639,7 +603,7 @@ def _calc_new_diffs(values, diff_list, prev_splits, new_splits):
             )

         # calculate the value change from one group to another
-        value_change = sum(values[start_index:end_index])
+        value_change = np.sum(values[start_index:end_index])

         # if diff < 0 the previous group gains values, so change value_change to -value_change
         value_change = value_change if diff > 0 else -value_change
@@ -664,16 +628,17 @@ def _regroup(self, groups):
             Indices for the train and test splits of each fold
         """

-        df = self._grouped_df.copy().reset_index()
+        group = self._obs_per_grp.copy()
         # set each unique group to the right group_id to group them into folds
-        df.loc[: self._best_splits[0], "group"] = 0
+        group[: self._best_splits[0] + 1] = 0

         for group_id, splits in enumerate(sliding_window(self._best_splits, 2, 1)):
             try:
-                df.loc[splits[0] : splits[1], "group"] = group_id + 1
+                group[splits[0] + 1 : splits[1] + 1] = group_id + 1
             except IndexError:
-                df.loc[splits[0] :, "group"] = group_id + 1
-
-        self._grouped_df = df
+                group[splits[0] + 1 :] = group_id + 1
+        self._group = group
         # create a mapper to set every group to the right group_id
-        mapper = dict(zip(df["index"], df["group"]))
+        mapper = dict(zip(self._index, self._group))
         return np.vectorize(mapper.get)(groups)
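For completeness, the final mapping step behaves exactly like the old DataFrame-based version, just fed from the stored arrays. A small sketch with hypothetical group labels:

```python
import numpy as np

_index = np.array([2018, 2019, 2020, 2021])  # unique group labels
_group = np.array([0, 0, 1, 2])              # fold id assigned to each label

# Map every row's original label to its fold id.
mapper = dict(zip(_index, _group))
groups = np.array([2018, 2018, 2019, 2020, 2021, 2021])
print(np.vectorize(mapper.get)(groups))  # [0 0 0 1 2 2]
```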