
Commit d9a54b8
Fix warnings and deprecations from polars version bump
yusufuyanik1 committed Dec 20, 2023
1 parent 3e4f783 commit d9a54b8
Showing 22 changed files with 283 additions and 129 deletions.
10 changes: 5 additions & 5 deletions examples/articles/ADMExplained.ipynb
@@ -641,12 +641,12 @@
" pl.col(\"Contents\").cast(pl.Utf8)\n",
" ).with_columns(\n",
" pl.when(pl.col(\"Type\") == \"numeric\")\n",
" .then(pl.col(\"Contents\").map_elements(lambda col: extract_numbers_in_contents(col, 0)))\n",
" .then(pl.col(\"Contents\").apply(lambda col: extract_numbers_in_contents(col, 0)))\n",
" .otherwise(pl.lit(-9999))\n",
" .alias(\"BinLowerBound\")\n",
" .cast(pl.Float32),\n",
" pl.when(pl.col(\"Type\") == \"numeric\")\n",
" .then(pl.col(\"Contents\").map_elements(lambda col: extract_numbers_in_contents(col, 1)))\n",
" .then(pl.col(\"Contents\").apply(lambda col: extract_numbers_in_contents(col, 1)))\n",
" .otherwise(pl.lit(-9999))\n",
" .alias(\"BinUpperBound\")\n",
" .cast(pl.Float32),\n",
@@ -701,12 +701,12 @@
" pl.col(\"BinSymbol\").alias(\"Bin\"),\n",
" BinPositives.alias(\"Positives\"),\n",
" BinNegatives.alias(\"Negatives\"),\n",
" ((pl.cumsum(\"BinResponseCount\") / pl.sum(\"BinResponseCount\")) * 100).alias(\n",
" ((pl.cum_sum(\"BinResponseCount\") / pl.sum(\"BinResponseCount\")) * 100).alias(\n",
" \"Cum. Total (%)\"\n",
" ),\n",
" (pl.col(\"BinPropensity\") * 100).alias(\"Propensity (%)\"),\n",
" (pl.col(\"AdjustedPropensity\") * 100).alias(\"Adjusted Propensity (%)\"),\n",
" ((pl.cumsum(\"BinPositives\") / pl.sum(\"BinPositives\")) * 100).alias(\n",
" ((pl.cum_sum(\"BinPositives\") / pl.sum(\"BinPositives\")) * 100).alias(\n",
" \"Cum Positives (%)\"\n",
" ),\n",
" pl.col(\"ZRatio\"),\n",
@@ -873,7 +873,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.11.5"
},
"orig_nbformat": 4
},
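Note on the hunks above: both are mechanical renames from recent polars deprecation cycles, `Expr.apply` to `Expr.map_elements` and `pl.cumsum` to `pl.cum_sum`. A minimal sketch of the new spellings on a toy frame (column names borrowed from the notebook, data made up):

```python
import polars as pl

df = pl.DataFrame({"Contents": ["[1, 2]", "[3, 4]"], "BinPositives": [10, 30]})

out = df.with_columns(
    # old: pl.col("Contents").apply(...)
    pl.col("Contents").map_elements(len, return_dtype=pl.Int64).alias("ContentsLength"),
    # old: pl.cumsum("BinPositives") / pl.sum("BinPositives")
    ((pl.cum_sum("BinPositives") / pl.sum("BinPositives")) * 100).alias("Cum Positives (%)"),
)
```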
2 changes: 1 addition & 1 deletion examples/hds/Example_Historical_Dataset_Analysis.ipynb
@@ -1111,7 +1111,7 @@
],
"source": [
"fig, ax = plt.subplots(figsize=(10, 5))\n",
"df_plot = df[['Customer_Age', 'Day']].groupby('Day').apply(\n",
"df_plot = df[['Customer_Age', 'Day']].groupby('Day').map_elements(\n",
" lambda x:x['Customer_Age'].isnull().sum()*100/x.shape[0]).reset_index().sort_values('Day')\n",
"pal = sns.color_palette(\"Reds_d\", len(df_plot))\n",
"rank = df_plot[0].argsort()\n",
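If the `df` in this cell were a polars frame, the per-day percentage of missing `Customer_Age` could be computed without a Python callback at all; a sketch with made-up data:

```python
import polars as pl

df = pl.DataFrame({
    "Day": [1, 1, 2, 2],
    "Customer_Age": [34, None, 51, None],
})

# Percentage of missing Customer_Age per Day, expressed natively.
df_plot = (
    df.group_by("Day")
    .agg((pl.col("Customer_Age").is_null().sum() * 100 / pl.count()).alias("PctMissing"))
    .sort("Day")
)
```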
2 changes: 1 addition & 1 deletion python/pdstools/__init__.py
@@ -4,7 +4,7 @@

from polars import enable_string_cache

-enable_string_cache(True)
+enable_string_cache()

import sys
from pathlib import Path
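`enable_string_cache` no longer takes a boolean in this polars release: a bare call enables the global cache, and `pl.StringCache` scopes it to a block. A sketch:

```python
import polars as pl
from polars import enable_string_cache

enable_string_cache()  # old: enable_string_cache(True)

# Or scope the cache, so Categorical columns built inside the block
# share one category mapping and can be compared or concatenated:
with pl.StringCache():
    s1 = pl.Series("s", ["x", "y"], dtype=pl.Categorical)
    s2 = pl.Series("s", ["y", "z"], dtype=pl.Categorical)
    combined = pl.concat([s1, s2])
```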
10 changes: 6 additions & 4 deletions python/pdstools/adm/ADMDatamart.py
@@ -794,7 +794,7 @@ def _getType(val):
.group_by(by)
.agg(pl.col("Modeldata").last())
.collect()
-.with_columns(pl.col("Modeldata").apply(lambda v: _getType(v)))
+.with_columns(pl.col("Modeldata").map_elements(lambda v: _getType(v)))
.to_dicts()
)
return {key: value for key, value in [i.values() for i in types]}
@@ -1082,7 +1082,7 @@ def response_gain_df(df: any_frame, by: str = "Channel") -> any_frame:
.sort([by, "ResponseCount"], descending=True)
.with_columns(
[
-(pl.cumsum("ResponseCount") / pl.sum("ResponseCount"))
+(pl.cum_sum("ResponseCount") / pl.sum("ResponseCount"))
.over(by)
.alias("TotalResponseFraction"),
((pl.col(by).cumcount() + 1) / pl.count("ResponseCount"))
@@ -1121,7 +1121,7 @@ def models_by_positives_df(
return (
modelsByPositives.join(
modelsByPositives["Positives"].cut(
-bins=list(range(0, 210, 10)),
+breaks=list(range(0, 210, 10)),
series=False,
category_label="PositivesBin",
),
@@ -1447,7 +1447,9 @@ def exportTables(self, file: Path = "Tables.xlsx", predictorBinning=False):
for tab in self.ApplicableTablesNoPredictorBinning
}

-with Workbook(file, {"nan_inf_to_errors": True}) as wb:
+with Workbook(
+    file, options={"nan_inf_to_errors": True, "remove_timezone": True}
+) as wb:
for tab, data in tabs.items():
data = data.with_columns(
pl.col(pl.List(pl.Categorical), pl.List(pl.Utf8))
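`Series.cut` went through the same rename cycle: its `bins` argument is now `breaks`. The Workbook change above moves the xlsxwriter flags into the `options` dict; `remove_timezone` avoids errors on timezone-aware datetimes, which Excel cells cannot hold. A sketch of the `cut` change with made-up counts:

```python
import polars as pl

positives = pl.Series("Positives", [3, 17, 42, 105, 180])

# old: positives.cut(bins=list(range(0, 210, 10)))
binned = positives.cut(breaks=list(range(0, 210, 10)))
```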
16 changes: 9 additions & 7 deletions python/pdstools/adm/ADMTrees.py
@@ -54,7 +54,7 @@ def getMultiTrees(file: pl.DataFrame, n_threads=1, verbose=True, **kwargs):
pl.col("SnapshotTime")
.dt.round("1s")
.cast(pl.Utf8)
-.str.rstrip(".000000000"),
+.str.strip_chars_end(".000000000"),
pl.col("Modeldata").str.decode("base64"),
pl.col("Configuration").cast(pl.Utf8),
)
@@ -290,7 +290,9 @@ def _post_import_cleanup(self, decode, **kwargs):
self.model = self.trees["model"]["booster"]["trees"]
except Exception as e3:
try:
-self.model = self.trees["model"]["model"]["booster"]["trees"]
+self.model = self.trees["model"]["model"]["booster"][
+    "trees"
+]
except Exception as e4:
raise (e1, e2, e3, e4)

@@ -473,7 +475,9 @@ def getGainsPerSplit(self) -> Tuple[Dict, pl.DataFrame, dict]:
list(zip(total_split_list, total_gains_list)), schema=["split", "gains"]
)
gainsPerSplit = gainsPerSplit.with_columns(
predictor=pl.col("split").map_elements(lambda x: self.parseSplitValues(x)[0])
predictor=pl.col("split").map_elements(
lambda x: self.parseSplitValues(x)[0]
)
)
return splitsPerTree, gainsPerTree, gainsPerSplit

@@ -498,7 +502,7 @@ def getGroupedGainsPerSplit(self) -> pl.DataFrame:
.alias("values"),
]
)
-.with_columns(n=pl.col("gains").list.lengths())
+.with_columns(n=pl.col("gains").list.len())
)

def getSplitsRecursively(
@@ -1046,9 +1050,7 @@ def computeOverTime(self, predictorCategorization=None):
to_plot = tree.computeCategorizationOverTime(predictorCategorization)[0]
outdf.append(
pl.DataFrame(to_plot).with_columns(
-SnapshotTime=pl.lit(timestamp).str.to_date(
-    format="%Y-%m-%d %X"
-)
+SnapshotTime=pl.lit(timestamp).str.to_date(format="%Y-%m-%d %X")
)
)

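The string and list namespaces follow the same pattern: `str.rstrip` became `str.strip_chars_end` (both strip any of the given characters from the end, not a literal suffix) and `list.lengths` became `list.len`. A sketch:

```python
import polars as pl

df = pl.DataFrame({
    "SnapshotTime": ["2023-12-20 10:15:00.000000000"],
    "gains": [[0.12, 0.05, 0.31]],
})

out = df.with_columns(
    pl.col("SnapshotTime").str.strip_chars_end(".000000000"),  # old: .str.rstrip(...)
    pl.col("gains").list.len().alias("n"),                     # old: .list.lengths()
)
```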
File renamed without changes.
Empty file.
8 changes: 4 additions & 4 deletions python/pdstools/ih/legacy_IH.py
@@ -134,13 +134,13 @@ def plot_daily_cumulative_accept_rate(df, pos, neg, **kwargs):
_df, rollup, hue = get_accept_rate_time(df, pos, neg, "Date", **kwargs)

if "hue" in kwargs.keys():
_df["Total_cum"] = _df.group_by(hue)["Total"].apply(lambda x: x.cumsum())
_df["Accepted_cum"] = _df.group_by(hue)["Accepted"].apply(lambda x: x.cumsum())
_df["Total_cum"] = _df.group_by(hue)["Total"].map_elements(lambda x: x.cum_sum())
_df["Accepted_cum"] = _df.group_by(hue)["Accepted"].map_elements(lambda x: x.cum_sum())
_df["hue"] = _df[hue].agg("__".join, axis=1)
kwargs["hue"] = "hue"
else:
_df["Total_cum"] = _df["Total"].cumsum()
_df["Accepted_cum"] = _df["Accepted"].cumsum()
_df["Total_cum"] = _df["Total"].cum_sum()
_df["Accepted_cum"] = _df["Accepted"].cum_sum()
_df["Cumulative Accept Rate (%)"] = _df["Accepted_cum"] * 100 / _df["Total_cum"]

if "allTime" in kwargs.keys():
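The polars-native form of these cumulative columns would be window expressions rather than per-group callbacks; a sketch over made-up interaction history (this legacy module's frames are pandas-style, so treat this purely as an illustration of the expression API):

```python
import polars as pl

df = pl.DataFrame({
    "Channel": ["Web", "Web", "Email", "Email"],
    "Total": [100, 120, 80, 90],
    "Accepted": [10, 18, 4, 9],
})

out = df.with_columns(
    pl.col("Total").cum_sum().over("Channel").alias("Total_cum"),
    pl.col("Accepted").cum_sum().over("Channel").alias("Accepted_cum"),
).with_columns(
    (pl.col("Accepted_cum") * 100 / pl.col("Total_cum")).alias("Cumulative Accept Rate (%)")
)
```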
8 changes: 4 additions & 4 deletions python/pdstools/pega_io/API.py
@@ -7,9 +7,9 @@ def _readClientCredentialFile(credentialFile): # pragma: no cover
with open(credentialFile) as f:
for idx, line in enumerate(f.readlines()):
if (idx % 2) == 0:
-key = line.rstrip("\n")
+key = line.strip_chars_end("\n")
else:
-outputdict[key] = line.rstrip("\n")
+outputdict[key] = line.strip_chars_end("\n")
return outputdict


@@ -50,7 +50,7 @@ def get_token(credentialFile: str, verify: bool = True, **kwargs): # pragma: no
verify=verify,
).json()
if "errors" in response:
-raise ConnectionRefusedError(f"Error when connecting to infinity: {e}")
+raise ConnectionRefusedError(f"Error when connecting to infinity: {response}")
return response["access_token"]


@@ -64,7 +64,7 @@ def setupAzureOpenAI(
"2023-07-01-preview",
"2023-09-15-preview",
"2023-10-01-preview",
"2023-12-01-preview"
"2023-12-01-preview",
] = "2023-12-01-preview",
):
"""Convenience function to automagically setup Azure AD-based authentication
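For contrast: built-in Python strings keep `str.rstrip`; `strip_chars_end` is the polars spelling. A standard-library sketch of this parser, assuming the same alternating key/value line layout (function name hypothetical):

```python
def read_client_credential_file(path: str) -> dict:
    # Credential files alternate a key line and a value line.
    outputdict = {}
    with open(path) as f:
        for idx, line in enumerate(f):
            if idx % 2 == 0:
                key = line.rstrip("\n")
            else:
                outputdict[key] = line.rstrip("\n")
    return outputdict
```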
4 changes: 3 additions & 1 deletion python/pdstools/plots/plot_base.py
@@ -777,7 +777,9 @@ def plotPredictorPerformance(
"""Plots a bar chart of the performance of the predictors
By default, this plot shows the performance over all models
-Use the querying functionality to drill down into a more specific subset
+Use the querying functionality to drill down into a more specific subset.
+Picks top n predictors with highest weighted average Performance across
+models and then sorts the predictors according to the median value.
Parameters
----------
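A sketch of the selection rule this docstring describes, with hypothetical datamart columns: take the top n predictors by response-weighted mean Performance, then order those by their median:

```python
import polars as pl

df = pl.DataFrame({
    "PredictorName": ["Age", "Age", "Income", "Income", "Tenure", "Tenure"],
    "Performance": [0.62, 0.58, 0.71, 0.69, 0.55, 0.54],
    "ResponseCount": [100, 300, 50, 80, 400, 10],
})

n = 2
top_predictors = (
    df.group_by("PredictorName")
    .agg(
        # Response-weighted mean Performance per predictor.
        weighted_mean=(pl.col("Performance") * pl.col("ResponseCount")).sum()
        / pl.col("ResponseCount").sum(),
        median=pl.col("Performance").median(),
    )
    .sort("weighted_mean", descending=True)
    .head(n)
    .sort("median", descending=True)
)
```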
11 changes: 6 additions & 5 deletions python/pdstools/reports/HealthCheck.qmd
@@ -150,6 +150,7 @@ elif len(datafolder) > 0 or len(modelfilename) > 0 or len(predictorfilename) > 0
path="." if len(datafolder) == 0 else datafolder,
model_filename="" if len(modelfilename) == 0 else modelfilename,
predictor_filename="" if len(predictorfilename) == 0 else predictorfilename,
+extract_keys=True,
include_cols="pyFeatureImportance",
).fillMissing()
else:
@@ -1575,7 +1576,7 @@ Very skewed results may be caused by prioritization elements like levers and wei
# by_as_list = by if isinstance(by,list) else [by]
# sortExpr = by_as_list + [sortExpr]
-# indexExpr = (pl.int_range(1, pl.count() + 1)/ pl.count()) if index is None else (pl.cumsum(index) / pl.sum(index))
+# indexExpr = (pl.int_range(1, pl.count() + 1)/ pl.count()) if index is None else (pl.cum_sum(index) / pl.sum(index))
# gains_df = (
# df.lazy()
@@ -1584,7 +1585,7 @@
# .select(
# by_as_list +
# [indexExpr.over(by).cast(pl.Float64).alias("cum_x"),
-# (pl.cumsum(value) / pl.sum(value)).over(by).cast(pl.Float64).alias("cum_y")]
+# (pl.cum_sum(value) / pl.sum(value)).over(by).cast(pl.Float64).alias("cum_y")]
# )
# )
# # Add entry for the (0,0) point
@@ -1620,7 +1621,7 @@ def gains_table(df, value: str, index=None, by=None):
indexExpr = (
(pl.int_range(1, pl.count() + 1) / pl.count())
if index is None
-else (pl.cumsum(index) / pl.sum(index))
+else (pl.cum_sum(index) / pl.sum(index))
)
if by is None:
@@ -1631,7 +1632,7 @@
.sort(sortExpr, descending=True)
.select(
indexExpr.cast(pl.Float64).alias("cum_x"),
-(pl.cumsum(value) / pl.sum(value)).cast(pl.Float64).alias("cum_y"),
+(pl.cum_sum(value) / pl.sum(value)).cast(pl.Float64).alias("cum_y"),
),
]
)
@@ -1645,7 +1646,7 @@
by_as_list
+ [
indexExpr.over(by).cast(pl.Float64).alias("cum_x"),
-(pl.cumsum(value) / pl.sum(value))
+(pl.cum_sum(value) / pl.sum(value))
.over(by)
.cast(pl.Float64)
.alias("cum_y"),
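A usage sketch of the building blocks in the reworked `gains_table`: `pl.int_range` for the cumulative x fraction and `pl.cum_sum` for the y fraction, both windowed per group, on a made-up frame:

```python
import polars as pl

df = pl.DataFrame({
    "Channel": ["Web", "Web", "Email", "Email"],
    "ResponseCount": [400, 100, 250, 50],
})

gains = df.sort(["Channel", "ResponseCount"], descending=True).select(
    pl.col("Channel"),
    # Position within the group as a fraction of group size.
    (pl.int_range(1, pl.count() + 1) / pl.count())
    .over("Channel")
    .cast(pl.Float64)
    .alias("cum_x"),
    # Cumulative share of responses within the group.
    (pl.cum_sum("ResponseCount") / pl.sum("ResponseCount"))
    .over("Channel")
    .cast(pl.Float64)
    .alias("cum_y"),
)
```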
4 changes: 2 additions & 2 deletions python/pdstools/reports/ModelReport.qmd
@@ -260,12 +260,12 @@ human_friendly_scoredistribution = (
pl.col("BinSymbol").alias("Bin"),
pl.col("BinResponseCount").alias("Responses"),
pl.col("BinPositives").alias("Positives"),
-(100 * (pl.col("BinPositives").cumsum(reverse=True)) / pl.sum("BinPositives"))
+(100 * (pl.col("BinPositives").cum_sum(reverse=True)) / pl.sum("BinPositives"))
.round(2)
.alias("Cum. Positives (%)"),
(
100
-* (pl.col("BinResponseCount").cumsum(reverse=True))
+* (pl.col("BinResponseCount").cum_sum(reverse=True))
/ pl.sum("BinResponseCount")
)
.round(2)
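`cum_sum` keeps the `reverse` flag, which is what makes these columns "cumulative from the last bin up" percentages; a sketch:

```python
import polars as pl

df = pl.DataFrame({"BinPositives": [5, 20, 75]})

out = df.select(
    (100 * pl.col("BinPositives").cum_sum(reverse=True) / pl.sum("BinPositives"))
    .round(2)
    .alias("Cum. Positives (%)")
)
```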