From d47ce88e2f118abe3c823c1084f26ac0d763f21d Mon Sep 17 00:00:00 2001 From: "Uyanik, Yusuf" Date: Thu, 28 Sep 2023 11:21:14 +0200 Subject: [PATCH 1/2] polars version upgrade --- examples/articles/ADMExplained.ipynb | 8 ++-- examples/articles/pdstoolsv3.ipynb | 2 +- examples/articles/thompsonsampling.ipynb | 6 +-- python/pdstools/adm/ADMDatamart.py | 18 ++++---- python/pdstools/adm/ADMTrees.py | 6 +-- python/pdstools/adm/Tables.py | 20 ++++----- python/pdstools/ih/IHAnalysis.py | 2 +- python/pdstools/ih/legacy_IH.py | 24 +++++------ python/pdstools/plots/plot_base.py | 44 ++++++++++---------- python/pdstools/plots/plots_plotly.py | 2 +- python/pdstools/reports/HealthCheck.qmd | 24 +++++------ python/pdstools/reports/HealthCheckModel.qmd | 18 ++++---- python/pdstools/utils/cdh_utils.py | 8 ++-- python/pdstools/utils/streamlit_utils.py | 2 +- python/pdstools/valuefinder/ValueFinder.py | 10 ++--- python/requirements.txt | 2 +- python/tests/test_cdh_utils.py | 4 +- 17 files changed, 100 insertions(+), 100 deletions(-) diff --git a/examples/articles/ADMExplained.ipynb b/examples/articles/ADMExplained.ipynb index a567dd70..45413acf 100644 --- a/examples/articles/ADMExplained.ipynb +++ b/examples/articles/ADMExplained.ipynb @@ -137,7 +137,7 @@ " model_id is None\n", "):\n", " display(\n", - " model.groupby(\"ModelID\")\n", + " model.group_by(\"ModelID\")\n", " .agg(\n", " number_of_predictors=pl.col(\"PredictorName\").n_unique(),\n", " model_performance=cdh_utils.weighed_performance_polars() * 100,\n", @@ -251,7 +251,7 @@ }, "outputs": [], "source": [ - "display(predictorbinning.groupby(\"PredictorName\").agg(\n", + "display(predictorbinning.group_by(\"PredictorName\").agg(\n", " pl.first(\"ResponseCount\").cast(pl.Int64).alias(\"# Responses\"),\n", " pl.n_unique(\"BinIndex\").alias(\"# Bins\"),\n", " (pl.first(\"PerformanceBin\") * 100).alias(\"Predictor Performance(AUC)\"),\n", @@ -665,7 +665,7 @@ "\n", "df = (\n", " modelpredictors.filter(pl.col(\"PredictorName\") != 
\"Classifier\")\n", - " .groupby(\"PredictorName\")\n", + " .group_by(\"PredictorName\")\n", " .agg(\n", " Value=pl.when(pl.col(\"Type\").first() == \"numeric\")\n", " .then(\n", @@ -873,7 +873,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.11.4" }, "orig_nbformat": 4 }, diff --git a/examples/articles/pdstoolsv3.ipynb b/examples/articles/pdstoolsv3.ipynb index 7ebb862f..3b2cd05d 100644 --- a/examples/articles/pdstoolsv3.ipynb +++ b/examples/articles/pdstoolsv3.ipynb @@ -404,7 +404,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.11.4" }, "orig_nbformat": 4 }, diff --git a/examples/articles/thompsonsampling.ipynb b/examples/articles/thompsonsampling.ipynb index de011f20..4a526a5c 100644 --- a/examples/articles/thompsonsampling.ipynb +++ b/examples/articles/thompsonsampling.ipynb @@ -80,7 +80,7 @@ "# Convergence of the Thompson Sampled propensities\n", "s = thompsonSamplingSimulation['positives']\n", "thompsonSamplingSimulation2 = thompsonSamplingSimulation.hstack(s.cut(breaks=np.array(range(int(s.min()), int(s.max())+20, 20))-1, series=False).select(bin='category'))\n", - "s = thompsonSamplingSimulation2.groupby(\"p\", \"bin\").agg(\n", + "s = thompsonSamplingSimulation2.group_by(\"p\", \"bin\").agg(\n", " n=pl.count(),\n", " n90=(((pl.col(\"sampled_propensity\") - pl.col(\"p\")) / pl.col(\"p\")) < 0.1).sum(),\n", " positives=pl.min(\"positives\"),\n", @@ -98,7 +98,7 @@ ").explode('sampled_propensity').with_columns(positives = pl.col('evidence')*pl.col('p'))\n", "from scipy.stats import gaussian_kde\n", "results = {}\n", - "for p, series in settings1.groupby('p'):\n", + "for p, series in settings1.group_by('p'):\n", " results[str(p)] = gaussian_kde(series['sampled_propensity'], 'silverman')(np.arange(0,0.15,0.0001))\n", "results = 
pl.DataFrame(results).with_columns(sampledPropensity=pl.Series(np.arange(0,0.15,0.0001))).to_pandas().set_index('sampledPropensity')\n", "DistributionOfSampled = px.area(results, title='Distribution of the sampled propensities
for a few combinations of model propensity and evidence', template='none', labels={'value':'', 'sampledPropensity':'Sampled Propensity', 'variable':'Propensity'}).update_yaxes({'visible':True}).update_xaxes({'tickformat':',.0%', 'tickmode':'array', 'tickvals':[0, 0.01, 0.05, 0.1]}).update_layout(showlegend=False).update_traces({'line':{'width':0.0}})#.add_annotation()\n", @@ -456,7 +456,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.11.4" }, "orig_nbformat": 4 }, diff --git a/python/pdstools/adm/ADMDatamart.py b/python/pdstools/adm/ADMDatamart.py index 2b8d58fd..fe39ca90 100644 --- a/python/pdstools/adm/ADMDatamart.py +++ b/python/pdstools/adm/ADMDatamart.py @@ -751,7 +751,7 @@ def discover_modelTypes( ) -> Dict: # pragma: no cover """Discovers the type of model embedded in the pyModelData column. - By default, we do a groupby Configuration, because a model rule can only + By default, we do a group_by Configuration, because a model rule can only contain one type of model. Then, for each configuration, we look into the pyModelData blob and find the _serialClass, returning it in a dict. 
@@ -791,7 +791,7 @@ def _getType(val): types = ( df.filter(pl.col("Modeldata").is_not_null()) - .groupby(by) + .group_by(by) .agg(pl.col("Modeldata").last()) .collect() .with_columns(pl.col("Modeldata").apply(lambda v: _getType(v))) @@ -907,7 +907,7 @@ def _create_sign_df( .alias("Daily_increase") .over("ModelID") ) - .groupby_dynamic("SnapshotTime", every=every, by=by) + .group_by_dynamic("SnapshotTime", every=every, by=by) .agg(pl.sum("Daily_increase").alias("Increase")) ) if pivot: @@ -947,7 +947,7 @@ def model_summary( Returns ------- pl.LazyFrame: - Groupby dataframe over all models + group_by dataframe over all models """ df = self._apply_query(self.modelData, query) data = self.last(df, strategy="lazy").lazy() @@ -959,7 +959,7 @@ def model_summary( assert required_columns.issubset(set(data.columns) | set(context_keys)) return ( - data.groupby(context_keys) + data.group_by(context_keys) .agg( [ pl.count(by).suffix("_count"), @@ -1027,7 +1027,7 @@ def pivot_df( if top_n > 0: top_n_xaxis = ( df.unique(subset=[by], keep="first") - .groupby(by) + .group_by(by) .agg( cdh_utils.weighed_average_polars("PerformanceBin", "ResponseCount") ) @@ -1037,7 +1037,7 @@ def pivot_df( ) df = top_n_xaxis.join(df, on=by, how="left") if by not in ["ModelID", "Name"]: - df = df.groupby([by, "PredictorName"]).agg( + df = df.group_by([by, "PredictorName"]).agg( cdh_utils.weighed_average_polars("PerformanceBin", "ResponseCount") ) df = ( @@ -1077,7 +1077,7 @@ def response_gain_df(df: any_frame, by: str = "Channel") -> any_frame: if isinstance(by, list): by = by[0] return ( - df.groupby([by, "ModelID"]) + df.group_by([by, "ModelID"]) .agg(pl.max("ResponseCount")) .sort([by, "ResponseCount"], descending=True) .with_columns( @@ -1129,7 +1129,7 @@ def models_by_positives_df( how="left", ) .lazy() - .groupby([by, "PositivesBin", "break_point"]) + .group_by([by, "PositivesBin", "break_point"]) .agg([pl.min("Positives"), pl.n_unique("ModelID").alias("ModelCount")]) .with_columns( 
(pl.col("ModelCount") / (pl.sum("ModelCount").over(by))).alias( diff --git a/python/pdstools/adm/ADMTrees.py b/python/pdstools/adm/ADMTrees.py index 0d99cade..a816f864 100644 --- a/python/pdstools/adm/ADMTrees.py +++ b/python/pdstools/adm/ADMTrees.py @@ -484,7 +484,7 @@ def getGroupedGainsPerSplit(self) -> pl.DataFrame: the mean gains, and the number of times the split is performed. """ return ( - self.gainsPerSplit.groupby("split", maintain_order=True) + self.gainsPerSplit.group_by("split", maintain_order=True) .agg( [ pl.first("predictor"), @@ -548,7 +548,7 @@ def plotSplitsPerVariable(self, subset: Optional[Set] = None, show=True): plt.figure """ figlist = [] - for name, data in self.gainsPerSplit.groupby("predictor"): + for name, data in self.gainsPerSplit.group_by("predictor"): if (subset is not None and name in subset) or subset is None: fig = make_subplots() fig.add_trace( @@ -613,7 +613,7 @@ def getTreeStats(self) -> pl.DataFrame: def getAllValuesPerSplit(self) -> Dict: """Generate a dictionary with the possible values for each split""" splitvalues = {} - for name, group in self.groupedGainsPerSplit.groupby("predictor"): + for name, group in self.groupedGainsPerSplit.group_by("predictor"): if name not in splitvalues.keys(): splitvalues[name] = set() splitvalue = group.get_column("values").to_list() diff --git a/python/pdstools/adm/Tables.py b/python/pdstools/adm/Tables.py index b11d0eed..d268e20e 100644 --- a/python/pdstools/adm/Tables.py +++ b/python/pdstools/adm/Tables.py @@ -36,9 +36,9 @@ def _by(self): return [ col for col in columns - if col in self.modelData.columns - and self.modelData.schema[col] != pl.Null + if col in self.modelData.columns and self.modelData.schema[col] != pl.Null ] + @property def AvailableTables(self): df = pl.DataFrame( @@ -56,7 +56,7 @@ def AvailableTables(self): df = df.transpose().with_columns(pl.Series(df.columns)) df.columns = ["modelData", "predictorData", "Tables"] return df.select(["Tables", "modelData", 
"predictorData"]) - + @property def ApplicableTables(self): df = self.AvailableTables @@ -70,7 +70,7 @@ def ApplicableTables(self): def model_overview(self): return ( self.last(strategy="lazy") - .groupby(["Configuration", "Channel", "Direction"]) + .group_by(["Configuration", "Channel", "Direction"]) .agg( [ pl.col("Name").unique().count().alias("Number of Actions"), @@ -93,7 +93,7 @@ def model_overview(self): @cached_property def predictors_per_configuration(self): return ( - self.combinedData.groupby("Configuration") + self.combinedData.group_by("Configuration") .agg( [ pl.col("PredictorName").unique().count().alias("Predictor Count"), @@ -108,7 +108,7 @@ def predictors_per_configuration(self): def bad_predictors(self): return ( self.predictorData.filter(pl.col("PredictorName") != "Classifier") - .groupby("PredictorName") + .group_by("PredictorName") .agg( [ pl.sum("ResponseCount").alias("Response Count"), @@ -123,7 +123,7 @@ def bad_predictors(self): @property def _zero_response(self): - return self.modelData.groupby(self._by).agg( + return self.modelData.group_by(self._by).agg( [pl.sum("ResponseCount"), pl.sum("Positives"), pl.mean("Performance")] ) @@ -139,14 +139,14 @@ def zero_positives(self): def _last_counts(self): return ( self.last(strategy="lazy") - .groupby(self._by) + .group_by(self._by) .agg([pl.sum("ResponseCount"), pl.sum("Positives"), pl.mean("Performance")]) ) @cached_property def reach(self): def calc_reach(x=pl.col("Positives")): - return 0.02 + 0.98 * (pl.min([pl.lit(200), x]) / 200) + return 0.02 + 0.98 * (pl.min_horizontal([pl.lit(200), x]) / 200) return ( self._last_counts.filter( @@ -165,7 +165,7 @@ def minimum_performance(self): @cached_property def appendix(self): return ( - self.modelData.groupby(self._by + ["ModelID"]) + self.modelData.group_by(self._by + ["ModelID"]) .agg( [ pl.max("ResponseCount").alias("Responses"), diff --git a/python/pdstools/ih/IHAnalysis.py b/python/pdstools/ih/IHAnalysis.py index 2cffe26e..c5862915 100644 --- 
a/python/pdstools/ih/IHAnalysis.py +++ b/python/pdstools/ih/IHAnalysis.py @@ -19,7 +19,7 @@ def _metricPerPeriod( df = ( df.sort(OutcomeTime_col) - .groupby_dynamic(OutcomeTime_col, every=period, by=by) + .group_by_dynamic(OutcomeTime_col, every=period, by=by) .agg(metrics) ) if isinstance(df, pl.LazyFrame): diff --git a/python/pdstools/plots/plot_base.py b/python/pdstools/plots/plot_base.py index 01e7c604..d5844253 100644 --- a/python/pdstools/plots/plot_base.py +++ b/python/pdstools/plots/plot_base.py @@ -113,12 +113,12 @@ def top_n( if top_n < 1: return df - if facets: df = 
df.join( - df.groupby(facets + ["PredictorName"]) + df.group_by(facets + ["PredictorName"]) .agg(weighed_average_polars(to_plot, "ResponseCountBin")) - .groupby(*facets) + .filter(pl.col(to_plot).is_not_nan()) + .group_by(*facets) .agg( pl.col("PredictorName") .sort_by(to_plot, descending=True) @@ -130,11 +130,11 @@ def top_n( else: df = df.join( - df.filter(pl.col("PredictorName").cast(pl.Utf8) != "Classifier") - .groupby("PredictorName") + df.group_by("PredictorName") .agg(weighed_average_polars(to_plot, "ResponseCountBin")) - .sort(to_plot) - .tail(top_n) + .filter(pl.col(to_plot).is_not_nan()) + .sort(to_plot, descending=True) + .head(top_n) .select("PredictorName"), on="PredictorName", ) @@ -310,7 +310,7 @@ def facettedPlot( figlist.append(plotFunc(facet=facet, *args, **kwargs)) else: order = kwargs.pop("order", None) - for facet_val, groupdf in kwargs.pop("df").groupby(*facets): + for facet_val, groupdf in kwargs.pop("df").group_by(*facets): figlist.append( plotFunc( df=groupdf, @@ -476,12 +476,12 @@ def plotOverTime( ) df = df.sort(by="SnapshotTime") - groupby = [by] + group_by = [by] if len(facets) > 0 and facets[0] is not None: - groupby = groupby + facets + group_by = group_by + facets if metric in ["Performance", "weighted_performance", "SuccessRate"]: df = ( - df.groupby_dynamic("SnapshotTime", every=every, by=groupby) + df.group_by_dynamic("SnapshotTime", every=every, by=group_by) .agg( [ weighed_average_polars("SuccessRate", "ResponseCount").alias( @@ -495,10 +495,10 @@ def plotOverTime( else: if mode == "diff": df = self._create_sign_df( - df, by=groupby, what=metric, every=every, mask=False, pivot=False + df, by=group_by, what=metric, every=every, mask=False, pivot=False ) elif mode == "Cumulative": - df = df.groupby(groupby + ["SnapshotTime"]).agg(pl.sum(metric)) + df = df.group_by(group_by + ["SnapshotTime"]).agg(pl.sum(metric)) if metric == "Performance": metric = "weighted_performance" @@ -580,10 +580,10 @@ def plotPropositionSuccessRates( 
top_n_by = by if facets == [None] else facets + [by] if top_n > 0: # TODO: fix. df = df.join( - df.groupby(facets) + df.group_by(facets) .agg(pl.mean(metric)) .sort(metric) .tail(top_n) .select(facets), on=facets, ) @@ -847,7 +847,7 @@ def plotPredictorPerformance( order = {} for facet, group_df in ( - df.groupby(*facets, "PredictorName") + df.group_by(*facets, "PredictorName") .agg( pl.median(to_plot).alias(f"median_{to_plot}"), ) @@ -861,7 +861,7 @@ def plotPredictorPerformance( partition = None df = self.top_n(df, top_n, to_plot) order = ( - df.groupby("PredictorName") + df.group_by("PredictorName") .agg( pl.median(to_plot).alias(f"median_{to_plot}"), ) @@ -961,7 +961,7 @@ def plotPredictorCategoryPerformance( df = df.filter(pl.col("PredictorName").cast(pl.Utf8) != "Classifier") df = ( - df.groupby(facets + ["ModelID", "PredictorCategory"]) + df.group_by(facets + ["ModelID", "PredictorCategory"]) .agg( weighed_average_polars("PerformanceBin", "ResponseCountBin").alias( "PerformanceBin" @@ -1060,7 +1060,7 @@ def plotPredictorContribution( df = ( df.filter(pl.col("PredictorName") != "Classifier") .with_columns((pl.col("PerformanceBin") - 0.5) * 2) - .groupby(by, "PredictorCategory") + .group_by(by, "PredictorCategory") .agg( Performance=weighed_average_polars("PerformanceBin", "BinResponseCount") ) @@ -1437,7 +1437,7 @@ def plotTreeMap( else: color_var = color_var.lower() color = kwargs.pop("color_col", defaults[color_var][0]) - values = kwargs.pop("groupby_col", defaults[color_var][1]) + values = kwargs.pop("group_by_col", kwargs.pop("groupby_col", defaults[color_var][1])) title = kwargs.pop("title", defaults[color_var][2]) reverse_scale = kwargs.pop("reverse_scale", defaults[color_var][3]) log = kwargs.pop("log", defaults[color_var][4]) @@ -1482,12 +1482,12 @@ def plotPredictorCount( df = ( df.filter(pl.col("PredictorName") != "Classifier") - .groupby(pl.all().exclude("PredictorName")) + .group_by(pl.all().exclude("PredictorName")) 
.agg(pl.n_unique("PredictorName").alias("Predictor Count")) ) overall = ( - df.groupby(pl.all().exclude(["PredictorName", "Type", "Predictor Count"])) + df.group_by(pl.all().exclude(["PredictorName", "Type", "Predictor Count"])) .agg(pl.sum("Predictor Count")) .with_columns(pl.lit("Overall").alias("Type")) ) diff --git a/python/pdstools/plots/plots_plotly.py b/python/pdstools/plots/plots_plotly.py index c68ba681..d603f299 100644 --- a/python/pdstools/plots/plots_plotly.py +++ b/python/pdstools/plots/plots_plotly.py @@ -269,7 +269,7 @@ def PropositionSuccessRates( if show_error: errors = { i[0]: i[1] - for i in df.groupby(by, maintain_order=True) + for i in df.group_by(by, maintain_order=True) .agg(pl.std("SuccessRate").fill_nan(0)) .iter_rows() } diff --git a/python/pdstools/reports/HealthCheck.qmd b/python/pdstools/reports/HealthCheck.qmd index 487e7604..bc0c7730 100644 --- a/python/pdstools/reports/HealthCheck.qmd +++ b/python/pdstools/reports/HealthCheck.qmd @@ -137,9 +137,9 @@ The standard Pega Next Best Action Designer framework defines a number of standa channel_overwiev_columns = [ col for col in ["Channel", "Direction"] if col in datamart_all_columns ] -channel_overview = last_data.groupby(["Configuration", "ModelID"] + channel_overwiev_columns).agg( +channel_overview = last_data.group_by(["Configuration", "ModelID"] + channel_overwiev_columns).agg( pl.max("ResponseCount"), pl.max("Positives") -).groupby(channel_overwiev_columns).agg( +).group_by(channel_overwiev_columns).agg( pl.sum("ResponseCount"), pl.sum("Positives"), pl.col("Configuration").unique().alias("Supported by Configurations"), @@ -194,7 +194,7 @@ df = ( .select(hover_columns + ["ModelID", "Channel/Direction","SuccessRate"]) .with_columns(pl.col("SuccessRate").round(4)) .sort(["Channel/Direction", "SuccessRate"], descending=True) - .groupby(["Channel/Direction"]) + .group_by(["Channel/Direction"]) .head(20) .collect() ).to_pandas(use_pyarrow_extension_array=True) @@ -251,7 +251,7 @@ 
Interactive chart with all success rates. ```{python} levels = [col for col in ["Configuration",'Channel', 'Direction', 'Issue', 'Group', "Name", "Treatment"] if col in datamart_all_columns] fig = datamart.plotTreeMap(color_var="SuccessRate", - groupby_col=None, + group_by_col=None, levels=levels, colorscale=pega_template.success, query=pl.col("ResponseCount")>100,) @@ -363,7 +363,7 @@ Using an interactive treemap to visualize the performance. Lighter is better, da It can be interesting to see which issues, groups or channels can be better predicted than others. Identifying categories of items for which the predictions are poor can help to drive the search for better predictors, for example. ```{python} -fig = datamart.plotTreeMap(color_var="performance_weighted", groupby_col= None, levels=levels) +fig = datamart.plotTreeMap(color_var="performance_weighted", group_by_col= None, levels=levels) fig.show() ``` @@ -577,7 +577,7 @@ path = [col for col in ["Configuration", "PredictorCategory", "PredictorName"] gb_cols = path path = [px.Constant("All Models")] + path -missing = datamart.last(table = "combinedData").filter(pl.col("PredictorName") != "Classifier").groupby(gb_cols).agg( +missing = datamart.last(table = "combinedData").filter(pl.col("PredictorName") != "Classifier").group_by(gb_cols).agg( pl.col("BinResponseCount") .where(pl.col("BinSymbol") == "MISSING") .sum() @@ -697,7 +697,7 @@ df = ( datamart.modelData .with_columns(pl.col(pl.Categorical).cast(pl.Utf8)) .with_columns(pl.col(pl.Utf8).fill_null("Missing")) - .groupby(by) + .group_by(by) .agg( [ ((pl.col("Positives") > 0) & (pl.col("Positives") < 100 )).sum().alias("immature_count"), @@ -787,7 +787,7 @@ for split_facet in facet.split("/"): pl.col(split_facet).cast(pl.Utf8).fill_null("NA") ) last_data = last_data.with_columns(pl.concat_str(facet.split("/"), separator="/").alias(facet)) -response_counts = last_data.groupby([facet] + facet.split("/")).agg( +response_counts = last_data.group_by([facet] + 
facet.split("/")).agg( [ pl.sum("ResponseCount").alias("all_responses"), pl.sum("Positives").alias("positive_sum") @@ -839,7 +839,7 @@ A lot of volume on the first bins, where the performance is minimal, means that to_plot = "Performance" df = ( datamart.modelData.with_columns(pl.col(to_plot) * 100) - .groupby([to_plot, "Channel", "Direction"]) + .group_by([to_plot, "Channel", "Direction"]) .agg(pl.sum("ResponseCount")) .with_columns(pl.col(to_plot).round(2)) .collect() @@ -848,7 +848,7 @@ df = ( breaks = [percentile for percentile in range(50, 100, 3)] df = df.with_columns(pl.col("Performance").cut(breaks=breaks).alias("PerformanceBin")) -grouped = df.groupby(["Channel", "PerformanceBin"]).agg( +grouped = df.group_by(["Channel", "PerformanceBin"]).agg( pl.sum("ResponseCount"), pl.min(to_plot).alias("break_label") ) out = ( @@ -919,7 +919,7 @@ if to_plot == "Propensity" and to_plot not in datamart.predictorData.columns: to_plot = "BinPropensity" df = ( datamart.combinedData.filter(pl.col("PredictorName") != "Classifier") - .groupby([to_plot, "Channel", "Direction"]) + .group_by([to_plot, "Channel", "Direction"]) .agg(pl.sum("BinResponseCount")) .with_columns(pl.col(to_plot).round(4).cast(pl.Float64)) .collect() @@ -954,7 +954,7 @@ df_pl = df.with_columns( .alias(f"{to_plot}_range") ) -grouped = df_pl.groupby(["Channel", f"{to_plot}_range"]).agg( +grouped = df_pl.group_by(["Channel", f"{to_plot}_range"]).agg( pl.sum("BinResponseCount"), pl.min(to_plot).alias("break_label") ) diff --git a/python/pdstools/reports/HealthCheckModel.qmd b/python/pdstools/reports/HealthCheckModel.qmd index 77dcff41..12de5ef0 100644 --- a/python/pdstools/reports/HealthCheckModel.qmd +++ b/python/pdstools/reports/HealthCheckModel.qmd @@ -138,9 +138,9 @@ The standard Pega Next Best Action Designer framework defines a number of standa channel_overwiev_columns = [ col for col in ["Channel", "Direction"] if col in datamart_all_columns ] -channel_overview = last_data.groupby(["Configuration", 
"ModelID"] + channel_overwiev_columns).agg( +channel_overview = last_data.group_by(["Configuration", "ModelID"] + channel_overwiev_columns).agg( pl.max("ResponseCount"), pl.max("Positives") -).groupby(channel_overwiev_columns).agg( +).group_by(channel_overwiev_columns).agg( pl.sum("ResponseCount"), pl.sum("Positives"), pl.col("Configuration").unique().alias("Supported by Configurations"), @@ -195,7 +195,7 @@ df = ( .select(hover_columns + ["ModelID", "Channel/Direction","SuccessRate"]) .with_columns(pl.col("SuccessRate").round(4)) .sort(["Channel/Direction", "SuccessRate"], descending=True) - .groupby(["Channel/Direction"]) + .group_by(["Channel/Direction"]) .head(20) .collect() ).to_pandas(use_pyarrow_extension_array=True) @@ -252,7 +252,7 @@ Interactive chart with all success rates. ```{python} levels = [col for col in ["Configuration",'Channel', 'Direction', 'Issue', 'Group', "Name", "Treatment"] if col in datamart_all_columns] fig = datamart.plotTreeMap(color_var="SuccessRate", - groupby_col=None, + group_by_col=None, levels=levels, colorscale=pega_template.success, query=pl.col("ResponseCount")>100,) @@ -364,7 +364,7 @@ Using an interactive treemap to visualize the performance. Lighter is better, da It can be interesting to see which issues, groups or channels can be better predicted than others. Identifying categories of items for which the predictions are poor can help to drive the search for better predictors, for example. 
```{python} -fig = datamart.plotTreeMap(color_var="performance_weighted", groupby_col= None, levels=levels) +fig = datamart.plotTreeMap(color_var="performance_weighted", group_by_col= None, levels=levels) fig.show() ``` @@ -469,7 +469,7 @@ df = ( datamart.modelData .with_columns(pl.col(pl.Categorical).cast(pl.Utf8)) .with_columns(pl.col(pl.Utf8).fill_null("Missing")) - .groupby(by) + .group_by(by) .agg( [ ((pl.col("Positives") > 0) & (pl.col("Positives") < 100 )).sum().alias("immature_count"), @@ -559,7 +559,7 @@ for split_facet in facet.split("/"): pl.col(split_facet).cast(pl.Utf8).fill_null("NA") ) last_data = last_data.with_columns(pl.concat_str(facet.split("/"), separator="/").alias(facet)) -response_counts = last_data.groupby([facet] + facet.split("/")).agg( +response_counts = last_data.group_by([facet] + facet.split("/")).agg( [ pl.sum("ResponseCount").alias("all_responses"), pl.sum("Positives").alias("positive_sum") @@ -611,7 +611,7 @@ A lot of volume on the first bins, where the performance is minimal, means that to_plot = "Performance" df = ( datamart.modelData.with_columns(pl.col(to_plot) * 100) - .groupby([to_plot, "Channel", "Direction"]) + .group_by([to_plot, "Channel", "Direction"]) .agg(pl.sum("ResponseCount")) .with_columns(pl.col(to_plot).round(2)) .collect() @@ -622,7 +622,7 @@ df_pl = df.get_column(to_plot).fill_null(0).fill_nan(0).cut(bins=cut_off_value, join = df.lazy().join( df_pl.select([to_plot, "PerformanceBin"]).unique(), on=to_plot, how="left" ) -grouped = join.groupby(["Channel", "PerformanceBin"]).agg(pl.sum("ResponseCount")) +grouped = join.group_by(["Channel", "PerformanceBin"]).agg(pl.sum("ResponseCount")) out = ( grouped.sort(["Channel", "PerformanceBin"]) .select( diff --git a/python/pdstools/utils/cdh_utils.py b/python/pdstools/utils/cdh_utils.py index 9529d3d4..e9e93d6f 100644 --- a/python/pdstools/utils/cdh_utils.py +++ b/python/pdstools/utils/cdh_utils.py @@ -255,7 +255,7 @@ def auc_from_probs( raise Exception("'Groundtruth' 
has more than two levels.") df = pl.DataFrame({"truth": groundtruth, "probs": probs}) - binned = df.groupby(by="probs").agg( + binned = df.group_by(by="probs").agg( [ (pl.col("truth") == 1).sum().alias("pos"), (pl.col("truth") == 0).sum().alias("neg"), @@ -333,7 +333,7 @@ def aucpr_from_probs( raise Exception("'Groundtruth' has more than two levels.") df = pl.DataFrame({"truth": groundtruth, "probs": probs}) - binned = df.groupby(by="probs").agg( + binned = df.group_by(by="probs").agg( [ (pl.col("truth") == 1).sum().alias("pos"), (pl.col("truth") == 0).sum().alias("neg"), @@ -581,7 +581,7 @@ def zRatio( It represents the number of standard deviations from the avreage, so centers around 0. The wider the spread, the better the predictor is. - To recreate the OOTB ZRatios from the datamart, use in a groupby. + To recreate the OOTB ZRatios from the datamart, use in a group_by. See `examples`. Parameters @@ -593,7 +593,7 @@ def zRatio( Examples -------- - >>> df.groupby(['ModelID', 'PredictorName']).agg([zRatio()]).explode() + >>> df.group_by(['ModelID', 'PredictorName']).agg([zRatio()]).explode() """ def getFracs(posCol=pl.col("BinPositives"), negCol=pl.col("BinNegatives")): diff --git a/python/pdstools/utils/streamlit_utils.py b/python/pdstools/utils/streamlit_utils.py index 03146b9d..6568d47a 100644 --- a/python/pdstools/utils/streamlit_utils.py +++ b/python/pdstools/utils/streamlit_utils.py @@ -254,7 +254,7 @@ def configure_predictor_categorization(): df = ( df.filter(pl.col("PredictorName") != "Classifier") .with_columns((pl.col("PerformanceBin") - 0.5) * 2) - .groupby("PredictorCategory") + .group_by("PredictorCategory") .agg( Performance=cdh_utils.weighed_average_polars( "PerformanceBin", "BinResponseCount" diff --git a/python/pdstools/valuefinder/ValueFinder.py b/python/pdstools/valuefinder/ValueFinder.py index 9609fa26..dfb1b32d 100644 --- a/python/pdstools/valuefinder/ValueFinder.py +++ b/python/pdstools/valuefinder/ValueFinder.py @@ -109,7 +109,7 @@ def 
__init__( .select(pl.col("pyStage").cat.set_ordering("physical")) .lazy() ) # This pre-fills the stringcache to make the ordering of stages correct - self.maxPropPerCustomer = self.df.groupby(["CustomerID", "pyStage"]).agg( + self.maxPropPerCustomer = self.df.group_by(["CustomerID", "pyStage"]).agg( pl.max("pyModelPropensity").alias("MaxModelPropensity") ) @@ -165,7 +165,7 @@ def getCustomerSummary( df = ( self.df.with_context(th) - .groupby(["CustomerID", "pyStage"]) + .group_by(["CustomerID", "pyStage"]) .agg( [ pl.max("pyPropensity").alias("MaxPropensity"), @@ -207,7 +207,7 @@ def getCountsPerStage( customersummary = self.customersummary df = ( - customersummary.groupby("pyStage") + customersummary.group_by("pyStage") .agg( [ pl.sum("relevantActions"), @@ -266,7 +266,7 @@ def getCountsPerThreshold(self, th, return_df=False) -> Optional[pl.LazyFrame]: ] ) ) - .groupby("pyStage") + .group_by("pyStage") .agg( [ pl.sum("relevantActions"), @@ -669,7 +669,7 @@ def plotFunnelChart(self, level: str = "Action", query=None, return_df=False): df = self.df if query is None else self.df.filter(query) df = ( - df.groupby("pyStage") + df.group_by("pyStage") .agg(pl.col(level).cast(pl.Utf8).value_counts(sort=True)) .explode(level) .unnest(level) diff --git a/python/requirements.txt b/python/requirements.txt index eb76ec0f..c5b026d3 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,7 +3,7 @@ plotly>=5.5.0 requests pydot tqdm -polars>=0.18.0,<=0.18.8 +polars==0.19.4 pyarrow pyyaml aioboto3>=11.0 diff --git a/python/tests/test_cdh_utils.py b/python/tests/test_cdh_utils.py index 55dc2bab..4bdb7448 100644 --- a/python/tests/test_cdh_utils.py +++ b/python/tests/test_cdh_utils.py @@ -147,7 +147,7 @@ def test_weighted_average_polars(): } ) output = ( - input.groupby("Channel") + input.group_by("Channel") .agg( cdh_utils.weighed_average_polars("SuccessRate", "ResponseCount").alias( "SuccessRate_weighted" @@ -194,7 +194,7 @@ def test_weighed_performance_polars(): ) 
output = ( - input.groupby("Channel") + input.group_by("Channel") .agg(cdh_utils.weighed_performance_polars()) .sort("Channel") ) From 9bfd47780067d0407a48668fa412e54b17b6d394 Mon Sep 17 00:00:00 2001 From: "Uyanik, Yusuf" Date: Thu, 28 Sep 2023 11:48:55 +0200 Subject: [PATCH 2/2] Make polars version more flexible --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index c5b026d3..c0249525 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,7 +3,7 @@ plotly>=5.5.0 requests pydot tqdm -polars==0.19.4 +polars>=0.19.0,<=0.19.5 pyarrow pyyaml aioboto3>=11.0